{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c1eede17-886c-4490-9002-06e461415e6f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from func.search.search_core import SearchCore\n",
    "# Base URL of the search service under test\n",
    "BASE_URL = \"http://180.184.65.98:38880/atomgit\"\n",
    "\n",
    "# Helper: GET {BASE_URL}/{endpoint} with optional query params, print the\n",
    "# outcome, and return the parsed JSON body — or None on any HTTP, network,\n",
    "# or timeout failure.\n",
    "def call_api(endpoint, params=None):\n",
    "    url = f\"{BASE_URL}/{endpoint}\"\n",
    "    try:\n",
    "        response = requests.get(url, params=params, timeout=10)\n",
    "        if response.status_code == 200:\n",
    "            payload = response.json()  # parse once; reuse for both print and return\n",
    "            print(f\"[SUCCESS] {endpoint} - Response:\")\n",
    "            print(payload)\n",
    "            return payload\n",
    "        # Non-200 bodies may not be JSON (e.g. an HTML error page), so report\n",
    "        # the raw text and fall through to return None without parsing.\n",
    "        print(f\"[ERROR] {endpoint} - HTTP {response.status_code}: {response.text}\")\n",
    "    except Exception as e:\n",
    "        print(f\"[EXCEPTION] {endpoint} - {str(e)}\")\n",
    "    return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f27de82b-d1bd-418a-8942-611f8630e2bf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing /metadata API...\n",
      "[SUCCESS] metadata - Response:\n",
      "{'paper_num': 17942, 'chunk_num': 151816, 'track_stats': {'total_track_nums': 49, 'track_counts': {'Conf_Paper_Meta_Data_SIGIR2023_with_whole_text.db': 656, 'Journal_Paper_Meta_Data_IEEE_Transactions_on_Pattern_Analysis_and_Machine_Intelligence_with_whole_text.db': 3275, 'Conf_Paper_Meta_Data_NeurIPS_2023_with_whole_text.db': 950, 'Conf_Paper_Meta_Data_AAAI_2023_with_whole_text.db': 523, 'Conf_Paper_Meta_Data_IJCAI2024_with_whole_text.db': 2583, 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db': 4938, 'Conf_Paper_Meta_Data_ICML2024_with_whole_text.db': 5623, 'Journal_Paper_Meta_Data_IEEE_Transactions_on_Knowledge_and_Data_Engineering_with_whole_text.db': 901, 'Conf_Paper_Meta_Data_ICLR2024_with_whole_text.db': 13174, 'Conf_Paper_Meta_Data_WWW_2023__with_whole_text.db': 1557, 'Conf_Paper_Meta_Data_IJCAI2023_with_whole_text.db': 2188, 'Conf_Paper_Meta_Data_ECCV2024_with_whole_text.db': 13649, 'Conf_Paper_Meta_Data_SIGIR2024_with_whole_text.db': 1140, 'Conf_Paper_Meta_Data_ICML_2023_with_whole_text.db': 139, 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db': 489, 'Journal_Paper_Meta_Data_Artificial_Intelligence_with_whole_text.db': 446, 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db': 11539, 'Conf_Paper_Meta_Data_CVPR_2023_with_whole_text.db': 12242, 'Journal_Paper_Meta_Data_International_Journal_of_Computer_Vision_with_whole_text.db': 434, 'Conf_Paper_Meta_Data_ACL_2023_with_whole_text.db': 4943, 'Conf_Paper_Meta_Data_ICLR_2023_with_whole_text.db': 7290, 'Conf_Paper_Meta_Data_ICCV_2023_with_whole_text.db': 10389, 'Conf_Paper_Meta_Data_CVPR2024_with_whole_text.db': 10804, 'Conf_Paper_Meta_Data_ACL_2022_Annual_Meeting_of_the_Association_for_Computational_Linguistics_with_whole_text.db': 2811, 'Conf_Paper_Meta_Data_ECAI_2023_with_whole_text.db': 482, 'Conf_Paper_Meta_Data_Crypto_2023_with_whole_text.db': 261, 'Conf_Paper_Meta_Data_CCS_2022_with_whole_text.db': 710, 
'Conf_Paper_Meta_Data_ECCV_2022_European_Conference_on_Computer_Vision_with_whole_text.db': 4539, 'Conf_Paper_Meta_Data_Crypto_2022_with_whole_text.db': 46, 'Conf_Paper_Meta_Data_CVPR_2022_IEEE_Conference_on_Computer_Vision_and_Pattern_Recognition_with_whole_text.db': 1304, 'Conf_Paper_Meta_Data_EMNLP_2022_Empirical_Methods_in_Natural_Language_Processing_with_whole_text.db': 3079, 'Conf_Paper_Meta_Data_ICML_2022_International_Conference_on_Machine_Learning_with_whole_text.db': 4336, 'Conf_Paper_Meta_Data_ICLR_2022_International_Conference_on_Learning_Representation_with_whole_text.db': 3000, 'Conf_Paper_Meta_Data_ISSTA_2022_with_whole_text.db': 111, 'Conf_Paper_Meta_Data_IJCAI_2022_International_Joint_Conference_on_Artificial_Intelligence_with_whole_text.db': 916, 'Conf_Paper_Meta_Data_MobiCom_2023_with_whole_text.db': 602, 'Conf_Paper_Meta_Data_NeurIPS_2022_Neural_Information_Processing_Systems_with_whole_text.db': 12416, 'Conf_Paper_Meta_Data_KDD2023_with_whole_text.db': 1710, 'Conf_Paper_Meta_Data_SIGIR_2022_Special_Interest_Group_on_Information_Retrieval_with_whole_text.db': 528, 'Conf_Paper_Meta_Data__STOC_2022_with_whole_text.db': 961, 'Conf_Paper_Meta_Data_SP_2022_with_whole_text.db': 397, 'Conf_Paper_Meta_Data_SIGMOD_2023_with_whole_text.db': 9, 'Conf_Paper_Meta_Data_USENIX_Security_2023_with_whole_text.db': 460, 'Conf_Paper_Meta_Data_SP_2023_with_whole_text.db': 335, 'Conf_Paper_Meta_Data_VLDB2023_with_whole_text.db': 111, 'Conf_Paper_Meta_Data_USENIX_Security_2022_with_whole_text.db': 668, 'Conf_Paper_Meta_Data_STOC_2023_with_whole_text.db': 1167, 'Conf_Paper_Meta_Data_VLDB_2022_with_whole_text.db': 215, 'Conf_Paper_Meta_Data_WWW_2022_The_Web_Conference_with_whole_text.db': 770}}}\n"
     ]
    }
   ],
   "source": [
    "# Smoke-test the /metadata endpoint: prints the corpus statistics\n",
    "# (paper/chunk counts and per-track breakdown) reported by the service.\n",
    "print(\"Testing /metadata API...\")\n",
    "call_api(\"metadata\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5cfeb6d1-3a9f-4de4-b2e3-45a16d22ef22",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Testing /search_papers API...\n",
      "[SUCCESS] search_papers - Response:\n",
      "[{'id': 454846633586283990, 'distance': 0.6351577043533325, 'entity': {'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 10, 'chunk_text': '# 5.3 Text2SQL Generation\\nTask Definition. Formatted data such as travel records and stock market transactions are stored in relational databases. Currently, accessing the database requires a data scientist who masters the SQL query language. Our task is to automatically synthesize SQL queries from natural language sentences using machine learning. Compared with the data expert approach, SQL query generation requires deeper reasoning across the structure of the database, the semantics of the structured query language, and the understanding of natural language. As shown in Figure 11, the input of the text2SQL generation is a sentence that describes the query in natural language and the table headers in the relational database. The output is a SQL query with the following structure:  \\n\\nSELECT agg-op sel-col WHERE (cond-col cond-op cond-val) AND ...  \\n\\nHere, SELECT and WHERE are keywords in the SQL language. What we need to predict are: (1) the aggregation operator $\\\\mathsf{a g g-o p}$ , which chooses among the set {empty, COUNT, MIN, MAX, SUM, AVG }; (2) the column name in selection sel-col and (3) the column name in condition cond-col , both of which are chosen from the table headers; (4) the conditional operator cond-op , which is in $\\\\{=,<,>\\\\}$ ; (5) the conditional value cond-val , which is assumed to be a sub-sequence of the given query. Here, one bracket pair () represents one conditional statement. The SQL query may have multiple conditions, which are denoted above by “ ... ”. Figure 11 displays this SQL query:\\n\\n# SELECT COUNT \"School\" WHERE \"No.\" = \"3\"\\nHere agg-op is COUNT ;sel-col is “school”, which is a column name from the table headers. One cond-col is “No.”, which also comes from the table headers. 
The cond-op is “=”. The cond-val is “3”, which we assume is from the input query. This example has one condition but multiple conditions are allowed.  \\n\\nDefinition of Constraints. Existing generative neural models for this task are not guaranteed to generate a query that follows the grammar of a SQL query. To avoid grammar violations, we compile a set of common SQL grammars as constraints into the Core-Sp module. The Core-Sp module will ensure that all the generated SQL queries follow the grammatical constraints. Our constraints are defined on the operators, namely the conditional operator cond-op and the aggregation operator agg-op . The domains of these operators are dependent upon the data types of the entities (namely, cond-col and sel-col )they operate on. Consider the previous example. The agg-op can only take values between $\\\\{\\\\mathrm{empty,~\\\\coUNT}\\\\}$ , because the sel-col is “school”, which is of the string type. More precisely, let $s$ be a column header (the value of sel-col or cond-col ). We define $F_{a}(s)$ as  \\n\\nInput Table:   \\n\\n\\n<html><body><table><tr><td></td><td>Player</td><td>No.</td><td>Position</td><td>School</td></tr><tr><td>0</td><td>Antonio</td><td>21</td><td>Guard-Forward</td><td>Duke</td></tr><tr><td>1</td><td>Voshon</td><td>2</td><td>Guard</td><td>Minnesota</td></tr><tr><td>2</td><td>Marin</td><td>3</td><td>Guard-Forward</td><td>Butler CC</td></tr></table></body></html>\\n\\n# Input Query:\\nHow many schools did player number 3 play at?\\n\\n# Output SQL Query:\\nFigure 11: An example for the Text2SQL generation task. The input is the text query “How many schools did player number 3 play at?” and the table header “ Player, No., Position, School ” from the relational database. The output should be the SQL query: SELECT COUNT \"School\" WHERE \"No. $\"~=~\"3\"$ .  
\\n\\nthe set of aggregation operators agg-op that can be associated with $s$ , and $F_{c}(s)$ as the set of condition operators cond-op that can be associated with $s$ . That is:  \\n\\n$$\\n\\\\begin{array}{r l}&{F_{a}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT,~\\\\forall\\\\mathrm{IIN},~\\\\forall\\\\mathrm{IAX},~\\\\forall\\\\mathrm{II},~\\\\mathrm{AVG}\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~string~type}}\\\\end{array}\\\\right.}\\\\\\\\ &{F_{c}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{=,~\\\\displaystyle>,~\\\\varsigma\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{=}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~string~type}}\\\\end{array}\\\\right.}\\\\end{array}\\n$$  \\n\\nWe also introduce dataype constraints, which are defined as:  \\n\\n$$\\n\\\\begin{array}{r l}&{\\\\mathtt{s e l-c o l}=s\\\\Rightarrow\\\\mathtt{a g g-o p}\\\\in F_{a}(s),}\\\\\\\\ &{\\\\mathtt{c o n d-c o l}=s\\\\Rightarrow\\\\mathtt{c o n d-o p}\\\\in F_{c}(s).}\\\\end{array}\\n$$  \\n\\nModel Structure. We embed the Core-Sp module to SQLova (Hwang et al., 2019), the state-of-the-art neural network for text2SQL generation. SQLova has a sequence-tosequence architecture. It first encodes a natural language sentence and the table headers into a high-dimensional vector. Then the decoder of SQLova decodes the hidden representation into the predictions of various entities in the SQL query. SQLova first determines the number of conditions in the SQL query and then fills in the ( cond-col ,cond-op ,cond-val ) for each condition. The operators agg-op, cond-op are predicted as a classification task from a fixed set of operators. Column names cond-col, sel-col are predicted from the set of table headers in the relational database. 
The cond-val is predicted by a pointer neural network which points at a span of the input natural language sentence. The selected span of the query is used as the cond-val (Dong and Lapata, 2018).  \\n\\nMDD Construction. The associated MDD that encodes the constraints for text2SQL generation is similar to the MDD for if-then program synthesis. The MDD is split into layers and every two layers form a group. One two-layer group is used to enforce constraints on an operator-column name pair. The operator-column name pair can be $\\\\mathsf{a g g-o p}$ and sel-col ,or can be cond-op and cond-col . Note that there can be only one group of $\\\\mathsf{a g g-o p}$ and sel-col and more than one group of cond-op and cond-col . In the first layer of the group, the column name is determined. In the second layer, the invalid operators are ruled out based on the type of the column name selected in the first layer. The two-layer group is copied several times because the SQL query can contain multiple conditions.  \\n\\nConstraint Reasoning Embedded Structured Prediction', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}}, {'id': 454845681760490286, 'distance': 0.6135610938072205, 'entity': {'paper_id': '6535d747939a5f408295c649', 'paper_title': 'Benchmarking and Improving Text-to-SQL Generation under Ambiguity', 'chunk_id': 1, 'chunk_text': '# 2 Background and Related Work\\nA Text-to-SQL model takes as input a question expressed as a natural language text x, and a database schema scomprising of table and column names, and outputs an SQL program ywhich can be executed against the database to answer the user’s question. Figure 1 shows an example. The training data for the task comprises (text, schema, SQL) triplets spanning multiple distinct databases.  \\n\\n  \\nFigure 1: A Text-to-SQL system converts a user question to an SQL query, conditioned on the database schema and/or content.  \\n\\nBenchmarks. 
Popular benchmarks for the Textto-SQL task are WikiSQL ( Zhong et al. ,2018 )and SPIDER ( Yu et al. ,2018 ). A few others have been proposed recently to capture real-world scenarios, such as KaggleDBQA ( Lee et al. ,2021 ), SEDE ( Hazoom et al. ,2021 ), and EHRSQL ( Lee et al. ,2022 ). They all attach one SQL per text, though some of them mention the problem of ambiguity in real-world datasets. Dr. SPIDER ( Chang et al. ,2023 ), designed to test the robustness of existing models, perturbs either the text or schema of SPIDER but still assigns one SQL per text.  \\n\\nAmbiguity in SQL Although ambiguity has been studied in other fields of NLP ( Pilault et al. ,2023 ;Li et al. ,2022 ;Futeral et al. ,2022 ), it has been unexplored in the context of semantic parsing. Ambiguity in SQL arising from related column names is discussed in ( Wang et al. ,2022 ), but they only consider column ambiguity. Their method of recognizing ambiguous queries depends on labeling words of the text and does not generalize to other kinds of ambiguity. To the best of our discernment, AmbiQT represents the first open benchmark for testing coverage of ambiguous alternatives.  \\n\\nDiverse Decoding. Prior work has critiqued the lack of meaningful diversity in beam-search outputs ( Finkel et al. ,2006 ;Gimpel et al. ,2013 ;Li et al. ,2016 ;Li and Jurafsky ,2016 ). In response, many fixes have been proposed. Some proposals attempt to restrict the tokens sampled, using strategies like Nucleus sampling ( Holtzman et al. ,2020 ), Truncated Sampling ( Hewitt et al. ,2022 ), and Typical Sampling ( Meister et al. ,2023 ), while some rely on Template-Based decoding ( Wiseman et al. ,2018 ;Zhang et al. ,2022 ;Fu et al. ,2023 ;Elgohary et al. ,2020 ;Awasthi et al. ,2022 ). A third approach is to generate a prefix with high diversity first, then generate the rest of the sentence with lower diversity. Narayan et al. 
(2022 ) follow this recipe but focus on incorporating diverse entity orders in text summarization.  \\n\\n<html><body><table><tr><td rowspan=\"2\">Kind of ambiguity</td><td rowspan=\"2\">Count</td><td colspan=\"3\">Example</td></tr><tr><td>QuestionText</td><td>SQL#1</td><td>SQL#2</td></tr><tr><td>Column Ambiguity (C)</td><td>1240</td><td>List the ids of all students.</td><td>SELECTroll_number FROMstudents</td><td>SELECTadmission_number FROMstudents</td></tr><tr><td>Table Ambiguity (T)</td><td>1417</td><td>How many singers do wehave?</td><td>SELECT COUNT(*) FROM artist</td><td>SELECT COUNT(*) FROM performer</td></tr><tr><td>Join Ambiguity (J)</td><td>288</td><td>Whatarethemakers and models?</td><td>SELECT maker，model FROM model</td><td>SELECT t2.maker，t1.model FROM modelASt1JOINmodel_maker AS t2 ON t1.model_id = t2.model_id</td></tr><tr><td>Precomputed Aggregates (P)</td><td>101</td><td>for each pet type.</td><td>Find the average weight|SELECT AVG(weight)， pettype FROM pets GROUP BY pettype</td><td>SELECT avg_weight，pettype FROM pets_weight</td></tr></table></body></html>\\n\\nTable 1: The AmbiQT benchmark. For each question, we list two valid SQL queries as per the schema. The schema is not shown here, but the ambiguity in it can be inferred based on the two SQL queries.\\n\\n# 3 AmbiQT: A Benchmark of Ambiguous Text-to-SQL Conversion\\nAmbiQT is constructed so that each text query has two distinct valid SQL interpretations. Motivated by our experience working with real-life databases, we designed AmbiQT to encompass four types of ambiguity. Each entry is designed so that both alternatives have a similar relevance to the question, and a well-calibrated decoding method is expected to rank them close by in their outputs.  \\n\\nWe create AmbiQT by modifying the SPIDER (Yu et al. ,2018 ) dataset, and use ChatGPT ( OpenAI ,2022 ) to aid with the creation. 
In each case, we modify the schema instead of the text as that provides greater control over the modification process. We explain the kinds of ambiguity in AmbiQT below and portray examples of each in Table 1 .  \\n\\nColumn Ambiguity (C). Unlike the SPIDER benchmark where column names usually appear verbatim in the question text (like born state for the column born_state ), when users unaware of the schema pose a natural question, they introduce column ambiguity ( Wang et al. ,2022 ). For example, “ What is the capacity of O2 Arena? ” could be ambiguous if the schema has separate columns for standing and seating capacity. Likewise, a query on the number of under-nourished children is ambiguous if we have different columns for “under-weight children” and “stunted growth in children”.  \\n\\nTo simulate column ambiguity, for each text $\\\\mathbf{x}$ ,schema s, and SQL yin SPIDER, we prompt ChatGPT to generate two synonyms for each column name of sin a one-shot manner. Appendix A furnishes more details of the prompt. We then modify sby replacing $c$ with two columns $c_{1},c_{2}$ , and we use yto generate two queries $\\\\mathbf{y}_{1},\\\\mathbf{y}_{2}$ where all mentions of $c$ are replaced with $c_{1}$ in $\\\\mathbf{y}_{1}$ and with $c_{2}$ in $\\\\mathbf{y}_{2}$ . An example appears in the first row of Table 1 . We do not reuse $c$ because the SPIDER dataset often contains column names verbatim in the question, and that would violate our attempt at keeping the two options at similar relevance levels. We modify one column at a time and generate up to 3 examples from each original entry.  \\n\\nTable Ambiguity (T). Table name ambiguity is common in databases obtained by integrating multiple data sources, as in web tables ( Cafarella et al. ,2008 ;Pimplikar and Sarawagi ,2012 ). Here again, we prompt ChatGPT to generate two alternate names for each table. We then modify one SQL yto generate two candidates ${\\\\bf y}_{1},{\\\\bf y}_{2}$ as shown in Table 1 .  
\\n\\nJoin Ambiguity (J). In production databases, a logical table is often vertically partitioned across several tables for efficient clustered access ( Stonebraker et al. ,2019 ). Column names overlapping across tables leads to Join Ambiguity. Suppose we have two tables: (1) person with columns id, name, email_address , and (2) person_details with columns id, postal_address, photo .A question asking for a person’s name and address is ambiguous on whether a JOIN with the person_details is necessary. We expose such ambiguity by modifying the schema as follows.  \\n\\nConsider a $(\\\\mathbf{x},\\\\mathbf{s},\\\\mathbf{y})$ triplet. Suppose yinvolves selecting two or more columns $c_{1},c_{2},\\\\ldots.$ not necessarily in the same order, from a table $t$ . Suppose further that $c_{1}$ is not a primary key of $t$ . We create a table called $t\\\\_c_{1}$ that includes just the primary key $p k_{t}$ of $t$ , and $c_{1}$ . The first alternative $\\\\mathbf{y}_{1}$ is $\\\\mathbf{y}$ and the second alternative $\\\\mathbf{y}_{2}$ uses a join over $t$ and $t\\\\_c_{1}$ , with everything else staying the same as y.  \\n\\n  \\nFigure 2: Beam Search works well when targeting only one output, but leads to superficial diversity, for example via different grouping and erroneous variants of column names.  \\n\\nPrecomputed Aggregates $(\\\\mathbf{P})$ :. This ambiguity is particularly common in data warehouses such as Data Commons which pre-aggregate certain variables. For instance, the “ total rice production ” of a state might refer to the column rice_production of state rather than a sum over it. Text-toSQL models have a bias toward introducing a sum()...group-by clause every time total appears in the text. The non-aggregated alternative is usually missing in the top$k$ options. We incorporate this ambiguity as follows.  \\n\\nFor each $(\\\\mathbf{x},\\\\mathbf{s},\\\\mathbf{y})$ , where $\\\\mathbf{y}$ has at least one aggregate, we construct a new table $t^{\\\\prime}$ . 
For each aggregate $\\\\boldsymbol{\\\\mathcal{A}}$ over column $c$ in y, we add to $t^{\\\\prime}$ the columns and the columns grouped by in $A^{\\\\prime}\\\\_c$ for all $\\\\mathcal{A}^{\\\\prime}\\\\,\\\\in\\\\,\\\\{\\\\mathsf{a v g},\\\\mathsf{s u m},\\\\mathsf{m i n},\\\\mathsf{m a x}\\\\}$ y. For count $(\\\\star)$ ,we add a column called number . We get two gold queries, the original yand a second with the groupby replaced by a direct SELECT on $t^{\\\\prime}$ as shown in the example in Table 1 . We also support aggregates across multiple tables but skip the details here.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454846943921018212, 'distance': 0.588153600692749, 'entity': {'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 2, 'chunk_text': '# 2 Problem Definition\\nWe introduce the Text2Analysis problem as follows: $(t a b l e,\\\\ q u e r y)\\\\to(c o d e,\\\\ r e s u l t)$ . The input consists of a table and a user query . The output consists of Python code snippet(s) and the corresponding result (s). A table has n fields $T\\\\ =\\\\ (f_{1},...,f_{n})$ , and each field consists of a field header and field values. A query is related to data analysis, particularly focusing on advanced analysis $(\\\\S2.1)$ that addresses the shortcomings of existing work and presents a greater and more difficult challenge for models. Additionally, it includes unclear queries $(\\\\S2.2)$ , which are often found in real-world user scenarios and can more effectively evaluate the model’s analytical capabilities.\\n\\n# 2.1 Analysis Operations and Tasks\\nText2Analysis expands the data analysis dataset to advanced analysis tasks. 
As shown in Figure 2, Data analysis can be divided into descriptive (what happened?), diagnostic (why did it happen?), predictive(what will happen), and prescriptive analytics (what should I do?) (Delen and Ram 2018b). And reporting and visualization may follow each type of analytics. Existing research on table-based data analysis tasks, such as TableQA and Text2SQL (Dong and Lapata 2016; Katsogiannis-Meimarakis and Koutrika 2021), has focused mainly on part of descriptive analytics that can be solved by SQL. They pay insufficient attention to advanced analysis that are beyond the rudimentary operations and require more in-depth analysis.  \\n\\nThe advanced analysis portion of Text2Analysis selects representative tasks from each type of analytics to form the dataset. From descriptive and diagnostic analytics, basic insights are chosen. From predictive analytics, forecasting is selected. And from reporting and visualization, chart generation is chosen. A more detailed introduction to each task will be provided after the following paragraph.  \\n\\nAdvanced analysis, along with rudimentary operations, form the Text2Analysis dataset. They can be combined to form a complex analysis. rudimentary operations and advanced operations (tasks in advanced analysis that output data such as tables and values, that is, tasks excluding reporting and visualization) can be interconnected, and reporting and visualization can be performed subsequently for display.  \\n\\nWe introduce the involved tasks one by one as follows:  \\n\\n1. Rudimentary Operations : These operations encompass a set of functions and procedures that can be executed using the Structured Query Language (SQL) (Date 1989). Their primary purpose is to enable users to perform data management, manipulation, and transformation on multidimensional structured data. The main operations include group by, aggregation, filter, sort, and so on.  \\n\\n2. 
Basic Insights : In the context of a multi-dimensional dataset, an insight represents an interesting observation about a particular subject from a specific perspective (Ding et al. 2019; Ma et al. 2021; Chen, Yang, and Ribarsky 2009). Text2Analysis incorporates seven commonly insights:  \\n\\n• Rank: Within a group comprising multiple values, the highest value significantly exceeds all other values.  \\n\\n• RankLast: Within a group comprising multiple values, the lowest value is notably smaller than all other values.  \\n\\n• Attribution: In a group of multiple non-negative values, the highest value is equal to or larger than the sum of all other values.  \\n\\n• Trend: A time series (segment) exhibits an increasing or decreasing trend.  \\n\\n• Monotonicity: A time series (segment of) exhibits a consistent and unidirectional increasing or decreasing trend.  \\n\\n• Outlier: A time series contains outliers, which deviate significantly from the trend compared to the majority of points and their neighbors.  \\n\\n• Unimodality: A (segment of) time series exhibits an unimodal distribution, characterized by a single peak or turning point, and may display U-shaped patterns.  \\n\\n3. Forecasting : Forecasting involves predicting future trends and outcomes by analyzing historical data using statistical methods, machine learning algorithms, and time series models (Taylor and Letham 2018; Hosseini et al. 2021). This process identifies patterns and relationships within the data, enabling informed predictions about future events.  \\n\\n4. Chart Generation : Chart generation refers to the recommendation and construction of prevalent charts derived from a given table (Moritz et al. 2019; Luo et al. 2018; Zhou et al. 2021).  \\n\\nWe choose commonly used Python libraries for each task as follows, to address the corresponding analysis query:  \\n\\n• Rudimentary Operations: Pandas 1 (APIs excluding plot$t i n g^{2},$ ).  
\\n\\n• Each task of Basic Insights: Custom functions are implemented to perform the mentioned tasks, and provide results for evaluation.  \\n\\n• Forecasting: Greykite 3 (Forecaster ), Prophet 4 (Prophet ).   \\n• Chart Generation: Matplotlib 5 (pyplot ).\\n\\n# 2.2 Unclear Queries\\nIn many real situations, users do not directly provide complete queries, but rather give queries with some unclear intents. There are various ways to address them, such as recommending completions for the missing intents or guiding users to complete the query. This paper focuses on proposing a benchmark and does not explore the solution methods in depth. We only use the model for recommendations, which can also satisfy the exploration of the next purpose.  \\n\\nSecondly, the analysis and recommendation capabilities of large language models can be explored through unclear queries. When recommending for unclear queries, the model not only needs to possess semantic parsing capabilities but also requires analytical recommendation capabilities. Exploring these capabilities of large language models is crucial for better utilizing them in the analytical domain.  \\n\\nAn unclear query lacks the essential information required to perform tasks. In other words, in the query, there are missing parameters for generating the analysis code which consists of operations from the chosen libraries. Since the same task may require different parameters in different libraries, we have combined the representatively used libraries for each task in $\\\\S2.1$ and selected the essential parameters as shown in Table 1. Some parameters are not provided for missing parameters as follows:  \\n\\n• When a parameter is absent, the associated operator will be excluded from use. The parameters for this scenarios are dimension field for rudimentary operations, filter condition for rudimentary operations, insight type for basic insights.  
\\n\\n• Parameters are typically not mentioned in the query or possess standard default values. The parameters for similar scenarios are confidence for forecasting, $p$ -value for basic insights, measure aggregation for basic insights.  \\n\\nIn addition to missing parameters, there are other types of unclear queries, such as, ambiguous parameters, unclear tasks. For ambiguous parameters, a query may have all parameters provided, but they are ambiguous or vague. E.g. , a table has two fields, UnitPrice and TotalPrice, but the query only mentions “price”, resulting in ambiguity. Another example is when a query mentions filtering “young people”, but there is no universally accepted definition of “young”, leading to varied age filters. There are more details in (Wang et al. 2023), and we will not discuss this further in this paper.  \\n\\nTable 1: Taxonomy and Examples of Unclear Queries   \\n\\n\\n<html><body><table><tr><td>Tasks</td><td>Parameters</td><td>Meanings of Parameters and Missing Parameters Query</td></tr><tr><td rowspan=\"3\">Rudimentary Operations</td><td>clear</td><td>E.g., Which brand has the highest total sales in 2023?</td></tr><tr><td>field (msr_field)</td><td>Measure field for sort or aggregation. E.g., Which brand had the best overall in 2023?</td></tr><tr><td>agg (agg-func)</td><td>Aggregation function, such as sum, average... E.g., Which brand has the highest sales in 2023?</td></tr><tr><td rowspan=\"2\">Basic Insights</td><td>clear</td><td>E.g., Does total increase over time?</td></tr><tr><td>field</td><td>Field for the insight. E.g., Is there an increase over time?</td></tr><tr><td rowspan=\"3\">Forecasting</td><td>clear</td><td>E.g., Forecast the cost data of different brands, categories and models of cars in 2012.</td></tr><tr><td>forecast field</td><td>Measure field used for forecasting. E.g., What will be for different categories and models of cars in 2012?</td></tr><tr><td>steps / freq</td><td>Forecasting steps and/or frequency. 
E.g., What will be the sales and cost data of different brands, categories and models of cars?</td></tr><tr><td rowspan=\"4\">Visualization</td><td>clear</td><td>E.g., Help me create a bar chart to visualize the Frequency field for the HH field.</td></tr><tr><td>chart type</td><td>Char type, including lineChart, barChart, scatterChart, pieChart. E.g., Help me create a chart to visualize the Frequency field for the HH field.</td></tr><tr><td>x fields</td><td>Fields for x-axis. E.g., Help me create a bar chart to visualize the Frequency field.</td></tr><tr><td>y fields</td><td>Fields for y-axis. E.g., Help me create a bar chart to visualize for the HH field.</td></tr></table></body></html>  \\n\\nFor unclear tasks, a query does not explicitly specify what task to use for analysis, e.g. , “What should I do if I want to get more profits”. This query only proposes a goal without specifying any tasks, and solving such problems requires stronger problem-solving abilities. In this work, we will not discuss this further and will consider it as future work.', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}}, {'id': 454845641360425782, 'distance': 0.5863323211669922, 'entity': {'paper_id': '6461b9c9d68f896efad43133', 'paper_title': 'Interactive Text-to-SQL Generation Via Editable Step-by-Step Explanations', 'chunk_id': 1, 'chunk_text': '# 2 Related Work\\n\\n# 2.1 Text-to-SQL Generation\\nNatural language interfaces have long been recognized as a way to expand access to databases ( Hendrix et al. ,1978 ).The construction of several large text-to-SQL datasets, such as WikiSQL ( Zhong et al. ,2017 ) and Spider ( Yu et al. ,2018 ), has enabled the adoption of deep learning models in this task, achieving unprecedented performance in recent years ( Rubin and Berant ,2021 ;Wang et al. ,2020a ;Scholak et al. ,2021 ;Yu et al. ,2020 ;Hwang et al. ,2019 ). Our technique is based on the recent success of neural text-to-SQL models. 
Unlike existing models that perform end-to-end SQL generation, we propose a new interaction mechanism for users to validate and refine generated queries through step-by-step explanations.  \\n\\nAs the first step to demonstrate the feasibility of our approach, we focus on single-turn SQL generation ( Yu et al. ,2018 ) in this work. There has also been recent work that supports multi-turn SQL generation ( Yu et al. ,2019a ,b;Guo et al. ,2021 ), where a sequence of interdependent queries are expressed in multiple utterances in a dialog. Models designed for multi-turn SQL generation typically need to reason about the dialog context and effectively encode the historical queries ( Wang et al. ,2021 ;Hui et al. ,2021 ;Zhang et al. ,2019 ;Cai and Wan ,2020 ;Wang et al. ,2020b ). Our approach can be extended to support multi-turn SQL generation by initiating separate refinement sessions for individual queries while incorporating the contextual information of previous queries into explanation generation and text-to-clause generation.\\n\\n# 2.2 Interactive Semantic Parsing for SQL\\nRecently, there has been a growing interest in interactive approaches that elicit user feedback to guide SQL generation. Iyer et al. (2017 ) proposed to allow users to flag incorrect queries and continuously retrain the model. Both DIY ( Narechania et al. ,2021 ) and NaLIR ( Li and Jagadish ,2014a ,b)enable users to select alternative values or subexpressions to fix an incorrect SQL query. PIIA ( Li et al. ,2020 ), MISP ( Yao et al. ,2019 ), and DialSQL ( Gur et al. ,2018 ) proactively ask for user feedback via multiple-choice questions. A common limitation of these methods is that they only solicit feedback in constrained forms, hindering their flexibility and effectiveness in addressing the variability of SQL errors. In contrast, our approach allows more flexible feedback through direct edits to the explanations generated by the model.  
\\n\\nThe only work that supports open-ended user feedback in SQL generation is NL-EDIT ( Elgohary et al. ,2021 ). NL-EDIT is trained on SPLASH ( Elgohary et al. ,2020 ), a dataset of SQL errors and user feedback utterances. Given an incorrect query, NL-EDIT allows users to provide a clarification utterance. Based on the utterance, the model generates a sequence of edits to the SQL query. Incorporating feedback expressed in a completely free-text utterance is challenging for two reasons:  \\n\\n  \\nFigure 2: An Overview of Interactive SQL Generation and Refinement with Editable Step-by-Step Explanations  \\n\\n(1) the model needs to infer which part of the SQL query to fix; (2) the model needs to determine what changes are being requested. In contrast, S TEPS asks users to directly edit an NL explanation and make corrections to the explanation. Comparing the initial explanation with the user-corrected explanation makes it easier to locate the part of a SQL query that needs to be changed and infer what change to make.  \\n\\nThe idea of SQL decomposition is similar to recent work that decomposes a user question to sub-questions on SPARQL ( Mo et al. ,2022 ). Their approach requires a crowd-sourced dataset to train a question decomposition model. In contrast, our rule-based method generates step-by-step explanations without the need for training a model. This also allows our system to map each entity in the explanation to the corresponding SQL element, making it easier for SQL correction (Sec. 3.2 ).\\n\\n# 2.3 Explaining SQL Queries in NL\\nOur approach is also related to prior work that generates NL explanations for SQL queries. Simitsis and Ioannidis (2009 ) argued that databases should “talk back” in human language so that users can verify results. Kokkalis et al. (2012 ) and Koutrika et al. (2010 ) used a graph-based SQL translation approach, where each query is represented as a graph and the explanation is generated by traversing the graph. Elgohary et al. 
(2021 ,2020 ) employed a template-based explanation approach, where they manually curated 57 templates for explanation generation. These approaches have limited capability to handle arbitrary SQL queries. To address this limitation, we propose a rule-based method to first explain terminal tokens (e.g., operators, keywords) and gradually compose them into a complete explanation based on the derivation rules in the SQL grammar. Another key difference is that none of the existing approaches supports editable explanations for SQL correction, which is a key feature to solicit user feedback in our approach.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454919307559982250, 'distance': 0.5849090814590454, 'entity': {'paper_id': '63608e5090e50fcafdee1152', 'paper_title': 'Diverse Parallel Data Synthesis for Cross-Database Adaptation of   Text-to-SQL Parsers', 'chunk_id': 0, 'chunk_text': '# Diverse Parallel Data Synthesis for Cross-Database Adaptation of Text-to-SQL Parsers\\nAbhijeet Awasthi Ashutosh Sathe Sunita Sarawagi Indian Institute of Technology Bombay, India {awasthi,absathe,sunita}@cse.iitb.ac.in\\n\\n# Abstract\\nText-to-SQL parsers typically struggle with databases unseen during the train time. Adapting parsers to new databases is a challenging problem due to the lack of natural language queries in the new schemas. We present REFILL , a framework for synthesizing highquality and textually diverse parallel datasets for adapting a Text-to-SQL parser to a target schema. REFILL learns to retrieve-and-edit text queries from the existing schemas and transfers them to the target schema. We show that retrieving diverse existing text, masking their schema-specific tokens, and refilling with tokens relevant to the target schema, leads to significantly more diverse text queries than achievable by standard SQL-to-Text generation methods. 
Through experiments spanning multiple databases, we demonstrate that fine-tuning parsers on datasets synthesized using R EFILL consistently outperforms the prior data-augmentation methods.\\n\\n# 1 Introduction\\nNatural Language interface to Databases (NLIDB) that translate text queries to executable SQLs is a challenging task in the field of Semantic Parsing ( Zelle and Mooney ,1996 ;Zettlemoyer and Collins ,2005 ;Berant et al. ,2013 ). In addition to understanding the natural language and generating an executable output, Text-to-SQL also requires the ability to reason over the schema structure of relational databases. Recently, datasets such as Spider ( Yu et al. ,2018 ) comprising of parallel (Text,SQL) pairs over hundreds of schemas have been released, and these have been used to train state-of-the-art neural Text-to-SQL models ( Wang et al. ,2020 ;Scholak et al. ,2021a ;Rubin and Berant ,2021 ;Scholak et al. ,2021b ;Xu et al. ,2021 ). However, several studies have independently shown that such Text-to-SQL models fail catastrophically when evaluated on unseen schemas from the real-world databases ( Suhr et al. ,2020 ;  \\n\\nLee et al. ,2021 ;Hazoom et al. ,2021 ). Adapting existing parsers to new schemas is challenging due to the lack of parallel data for fine-tuning the parser.  \\n\\nSynthesizing parallel data, that is representative of natural human generated queries ( Wang et al. ,2015 ;Herzig and Berant ,2019 ), is a long-standing problem in semantic parsing. Several methods have been proposed for supplementing with synthetic data, ranging from grammar-based canonical queries to full-fledged conditional text generation models ( Wang et al. ,2015 ;Herzig and Berant ,2019 ;Zhong et al. ,2020 ;Yang et al. ,2021 ;Zhang et al. ,2021 ;Wang et al. ,2021 ). For Text-to-SQL, data-augmentation methods are primarily based on training an SQL-to-Text model using labeled data from pre-existing schemas, and generating data in the new schemas. 
We show that the text generated by these methods, while more natural than canonical queries, lacks the rich diversity of natural multi-user queries. Fine-tuning with such data often deteriorates the model performance since the lack of diversity leads to a biased model.  \\n\\nWe propose a framework called R EFILL (§ 2 )for generating diverse text queries for a given SQL workload that is often readily available ( Baik et al. ,2019 ). R EFILL leverages parallel datasets from several existing schemas, such as Spider ( Yu et al. ,2018 ), to first retrieve a diverse set of text paired with SQLs that are structurally similar to a given SQL $q$ (§ 2.1 ). Then, it trains a novel schema translator model for converting the text of the training schema to the target schema of $q$ . The schema translator is decomposed into a mask and fill step to facilitate training without direct parallel examples of schema translation. Our design of the mask module and our method of creating labeled data for the fill module entails non-trivial details that we explain in this paper $\\\\left(\\\\S\\\\ 2.2\\\\right)$ . R E-FILL also incorporates a method of filtering-out inconsistent (Text,SQL) pairs using an independent binary classifier $\\\\left(\\\\S\\\\,2.3\\\\right)$ , that provides more useful quality scores, than the cycle-consistency based filtering ( Zhong et al. ,2020 ). Our approach is related to retrieve-and-edit models that have been used for semantic parsing ( Hashimoto et al. ,2018 ), dialogue generation ( Chi et al. ,2021 ), translation ( Cai et al. ,2021 ), and question answering ( Karpukhin et al. ,2020 ). However, our method of casting the \"edit\" as a two-step mask-and-fill schema translation model is different from the prior work.  
\\n\\nWe summarize our contributions as follows: (i) We propose the idea of retrieving and editing natural text from several existing schemas for transferring it to a target schema, obtaining higher text diversity compared to the standard SQL-to-Text generators. (ii) We design strategies for masking schema-specific words in the retrieved text and training the R EFILL model to fill in the masked positions with words relevant to the target schema. (iii) We filter high-quality parallel data using a binary classifier and show that it is more efficient than existing methods based on cycle-consistency filtering. (iv) We compare R EFILL with prior dataaugmentation methods across multiple schemas and consistently observe that fine-tuning Text-toSQL parsers on data generated by R EFILL leads to more accurate adaptation.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2022_Empirical_Methods_in_Natural_Language_Processing_with_whole_text.db'}}, {'id': 454845641470001986, 'distance': 0.5768881440162659, 'entity': {'paper_id': '6461b9c9d68f896efad43133', 'paper_title': 'Interactive Text-to-SQL Generation Via Editable Step-by-Step Explanations', 'chunk_id': 7, 'chunk_text': '# 10 Ethical Consideration\\nThe interactive text-to-SQL system proposed by this work poses minimal risks to human users and society. Instead, it will significantly lower the barrier of querying database systems and empower a great number of people, especially those without technical backgrounds, to access and analyze data. To evaluate the usability of our system, we conducted a human-subject study with real users. To minimize the risks to human subjects, we strictly followed the community standards with the approval from the Purdue University IRB office. In the recruitment email, we shared a consent form that includes detailed information about the study Yujian Gan, Xinyun Chen, Qiuping Huang, Matthew Purver, John R. Woodward, Jinxia Xie, and Pengsheng Huang. 2021a. 
Towards robustness of textto-SQL models against synonym substitution . In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers) , pages 2505– 2515, Online. Association for Computational Linguistics.  \\n\\nYujian Gan, Xinyun Chen, and Matthew Purver. 2021b. Exploring underexplored limitations of cross-domain text-to-SQL generalization . In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing , pages 8926–8931, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.  \\n\\nDawei Gao, Haibin Wang, Yaliang Li, Xiuyu Sun, Yichen Qian, Bolin Ding, and Jingren Zhou. 2023. Text-to-sql empowered by large language models: A benchmark evaluation .arXiv .  \\n\\nAlessandra Giordani and Alessandro Moschitti. 2012. Translating questions to SQL queries with generative parsers discriminatively reranked . In Proceedings of COLING 2012: Posters , pages 401–410, Mumbai, India. The COLING 2012 Organizing Committee.  \\n\\nJiaqi Guo, Ziliang Si, Yu Wang, Qian Liu, Ming Fan, Jian-Guang Lou, Zijiang Yang, and Ting Liu. 2021. Chase: A large-scale and pragmatic Chinese dataset for cross-database context-dependent text-to-SQL .In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers) , pages 2316–2331, Online. Association for Computational Linguistics.  \\n\\nIzzeddin Gur, Semih Yavuz, Yu Su, and Xifeng Yan. 2018. DialSQL: Dialogue based structured query generation . In Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 1339–1349, Melbourne, Australia. Association for Computational Linguistics.  \\n\\nGary G. Hendrix, Earl D. Sacerdoti, Daniel Sagalowicz, and Jonathan Slocum. 1978. 
Developing a natural language interface to complex data .ACM Trans. Database Syst. , 3(2):105–147.  \\n\\nBinyuan Hui, Ruiying Geng, Qiyu Ren, Binhua Li, Yongbin Li, Jian Sun, Fei Huang, Luo Si, Pengfei Zhu, and Xiaodan Zhu. 2021. Dynamic hybrid relation network for cross-domain context-dependent semantic parsing .CoRR , abs/2101.01686.  \\n\\nWonseok Hwang, Jinyeong Yim, Seunghyun Park, and Minjoon Seo. 2019. A comprehensive exploration on wikisql with table-aware word contextualization .In KR2ML Workshop at NeurIPS .  \\n\\nSrinivasan Iyer, Ioannis Konstas, Alvin Cheung, Jayant Krishnamurthy, and Luke Zettlemoyer. 2017. Learning a neural semantic parser from user feedback . In Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) , pages 963–973, Vancouver, Canada. Association for Computational Linguistics.  \\n\\nAndreas Kokkalis, Panagiotis Vagenas, Alexandros Zervakis, Alkis Simitsis, Georgia Koutrika, and Yannis Ioannidis. 2012. Logos: A system for translating queries into narratives . In Proceedings of the 2012 ACM SIGMOD International Conference on Management of Data , SIGMOD ’12, page 673–676, New York, NY, USA. Association for Computing Machinery.  \\n\\nGeorgia Koutrika, Alkis Simitsis, and Yannis E. Ioannidis. 2010. Explaining structured queries in natural language . In 2010 IEEE 26th International Conference on Data Engineering (ICDE 2010) , pages 333–344.  \\n\\nFei Li and H. V. Jagadish. 2014a. Constructing an interactive natural language interface for relational databases.Proc. VLDB Endow., 8(1):73–84.  \\n\\nFei Li and Hosagrahar V Jagadish. 2014b. Nalir: An interactive natural language interface for querying relational databases . In Proceedings of the 2014 ACM SIGMOD International Conference on Management of Data , SIGMOD ’14, page 709–712, New York, NY, USA. Association for Computing Machinery.  \\n\\nHaoyang Li, Jing Zhang, Cuiping Li, and Hong Chen. 2023a. 
RESDSQL: Decoupling schema linking and skeleton parsing for text-to-sql .In Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence , AAAI’23/IAAI’23/EAAI’23. AAAI Press.  \\n\\nJinyang Li, Binyuan Hui, Reynold Cheng, Bowen Qin, Chenhao Ma, Nan Huo, Fei Huang, Wenyu Du, Luo Si, and Yongbin Li. 2023b. Graphix-t5: Mixing pre-trained transformers with graph-aware layers for text-to-sql parsing .In Proceedings of the ThirtySeventh AAAI Conference on Artificial Intelligence and Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence and Thirteenth Symposium on Educational Advances in Artificial Intelligence , AAAI’23/IAAI’23/EAAI’23. AAAI Press.  \\n\\nYuntao Li, Bei Chen, Qian Liu, Yan Gao, Jian-Guang Lou, Yan Zhang, and Dongmei Zhang. 2020. “what do you mean by that?” a parser-independent interactive approach for enhancing text-to-SQL . In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 6913–6922, Online. Association for Computational Linguistics.  \\n\\nLingbo Mo, Ashley Lewis, Huan Sun, and Michael White. 2022. Towards transparent interactive semantic parsing via step-by-step correction . In Findings of the Association for Computational Linguistics: ACL 2022 , pages 322–342, Dublin, Ireland. Association for Computational Linguistics.  \\n\\nArpit Narechania, Adam Fourney, Bongshin Lee, and Gonzalo Ramos. 2021. Diy: Assessing the correctness of natural language to sql systems .In 26th International Conference on Intelligent User Interfaces , IUI ’21, page 597–607, New York, NY, USA. Association for Computing Machinery.  \\n\\nSaul B. Needleman and Christian D. Wunsch. 1970. 
A general method applicable to the search for similarities in the amino acid sequence of two proteins .Journal of Molecular Biology , 48(3):443–453.  \\n\\nAxel-Cyrille Ngonga Ngomo, Lorenz Bühmann, Christina Unger, Jens Lehmann, and Daniel Gerber. 2013. Sparql2nl: Verbalizing sparql queries .In Proceedings of the 22nd International Conference on World Wide Web , WWW ’13 Companion, page 329–332, New York, NY, USA. Association for Computing Machinery.  \\n\\nZheng Ning, Zheng Zhang, Tianyi Sun, Yuan Tian, Tianyi Zhang, and Toby Jia-Jun Li. 2023. An empirical study of model errors and user error discovery and repair strategies in natural language database queries . In Proceedings of the 28th International Conference on Intelligent User Interfaces , IUI $^{\\\\prime}23$ ,page 633–649, New York, NY, USA. Association for Computing Machinery.  \\n\\nAna-Maria Popescu, Alex Armanasu, Oren Etzioni, David Ko, and Alexander Yates. 2004. Modern natural language interfaces to databases: Composing statistical parsing with semantic tractability . In COLING 2004: Proceedings of the 20th International Conference on Computational Linguistics , pages 141– 147, Geneva, Switzerland. COLING.  \\n\\nOhad Rubin and Jonathan Berant. 2021. SmBoP: Semiautoregressive bottom-up semantic parsing . In Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , pages 311–324, Online. Association for Computational Linguistics.  \\n\\nTorsten Scholak, Nathan Schucher, and Dzmitry Bahdanau. 2021. PICARD: Parsing incrementally for constrained auto-regressive decoding from language models . In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing ,pages 9895–9901, Online and Punta Cana, Dominican Republic. Association for Computational Linguistics.  \\n\\nAlkis Simitsis and Yannis Ioannidis. 2009. Dbmss should talk back too . In 10.48550/ARXIV.0909.1786 .arXiv.  
\\n\\nYu Su, Ahmed Hassan Awadallah, Madian Khabsa, Patrick Pantel, Michael Gamon, and Mark Encarnacion. 2017. Building natural language interfaces to web apis . In Proceedings of the 2017 ACM on Conference on Information and Knowledge Management ,CIKM ’17, page 177–186, New York, NY, USA. Association for Computing Machinery.  \\n\\nBailin Wang, Richard Shin, Xiaodong Liu, Oleksandr Polozov, and Matthew Richardson. 2020a. RATSQL: Relation-aware schema encoding and linking for text-to-SQL parsers . In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics , pages 7567–7578, Online. Association for Computational Linguistics.  \\n\\nRunze Wang, Zhen-Hua Ling, Jingbo Zhou, and Yu Hu. 2020b. Tracking interaction states for multi-turn textto-sql semantic parsing .CoRR , abs/2012.04995.  \\n\\nXiaxia Wang, Sai Wu, Lidan Shou, and Ke Chen. 2021. An interactive nl2sql approach with reuse strategy . In Database Systems for Advanced Applications: 26th International Conference, DASFAA 2021, Taipei, Taiwan, April 11–14, 2021, Proceedings, Part II , page 280–288, Berlin, Heidelberg. Springer-Verlag.  \\n\\nZiyu Yao, Yu Su, Huan Sun, and Wen-tau Yih. 2019. Model-based interactive semantic parsing: A unified framework and a text-to-SQL case study . In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) , pages 5447–5458, Hong Kong, China. Association for Computational Linguistics.  \\n\\nTao Yu, Chien-Sheng Wu, Xi Victoria Lin, Bailin Wang, Yi Chern Tan, Xinyi Yang, Dragomir R. Radev, Richard Socher, and Caiming Xiong. 2020. Grappa: Grammar-augmented pre-training for table semantic parsing .CoRR , abs/2009.13845.  
\\n\\nTao Yu, Rui Zhang, Heyang Er, Suyi Li, Eric Xue, Bo Pang, Xi Victoria Lin, Yi Chern Tan, Tianze Shi, Zihan Li, Youxuan Jiang, Michihiro Yasunaga, Sungrok Shim, Tao Chen, Alexander Fabbri, Zifan Li, Luyao Chen, Yuwen Zhang, Shreya Dixit, Vincent Zhang, Caiming Xiong, Richard Socher, Walter Lasecki, and Dragomir Radev. 2019a. CoSQL: A conversational text-to-SQL challenge towards crossdomain natural language interfaces to databases . In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) , pages 1962– 1979, Hong Kong, China. Association for Computational Linguistics.  \\n\\nTao Yu, Rui Zhang, Kai Yang, Michihiro Yasunaga, Dongxu Wang, Zifan Li, James Ma, Irene Li, Qingning Yao, Shanelle Roman, Zilin Zhang, and Dragomir Radev. 2018. Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-SQL task . In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing , pages 3911–3921, Brussels, Belgium. Association for Computational Linguistics.  \\n\\nTao Yu, Rui Zhang, Michihiro Yasunaga, Yi Chern Tan, Xi Victoria Lin, Suyi Li, Heyang Er, Irene Li, Bo Pang, Tao Chen, Emily Ji, Shreya Dixit, David Proctor, Sungrok Shim, Jonathan Kraft, Vincent Zhang, Caiming Xiong, Richard Socher, and Dragomir Radev. 2019b. SParC: Cross-domain semantic parsing in context . In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics , pages 4511–4523, Florence, Italy. Association for Computational Linguistics.  \\n\\nRui Zhang, Tao Yu, Heyang Er, Sungrok Shim, Eric Xue, Xi Victoria Lin, Tianze Shi, Caiming Xiong, Richard Socher, and Dragomir Radev. 2019. Editingbased SQL query generation for cross-domain context-dependent questions . 
In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) , pages 5338–5349, Hong Kong, China. Association for Computational Linguistics.  \\n\\nYiyun Zhao, Jiarong Jiang, Yiqun Hu, Wuwei Lan, Henry Zhu, Anuj Chauhan, Alexander Li, Lin Pan, Jun Wang, Chung-Wei Hang, Sheng Zhang, Marvin Dong, Joe Lilien, Patrick Ng, Zhiguo Wang, Vittorio Castelli, and Bing Xiang. 2022. Importance of synthesizing high-quality data for text-to-sql parsing .  \\n\\nVictor Zhong, Caiming Xiong, and Richard Socher. 2017. Seq2sql: Generating structured queries from natural language using reinforcement learning . In arxiv preprint, arxiv/1709.00103. arXiv.  \\n\\nA SQL Grammar and Translation Rules   \\nTable 6: A Simplified SQL Grammar   \\n\\n\\n<html><body><table><tr><td>(sql〉:=SELECT（nouns〉〈sub> I〈sql〉INTERSECT〈sql> (bs〉NOINn〈[bs>1 I〈sql>EXCEPT〈sql> (sub）:= ∈ IFROM〈noun）〈sub></td></tr><tr><td>IWHERE（condition）〈sub> |JOIN（noun）ON（condition>（sub） IGROUP BY〈noun〉〈sub> |HAVING（condition）〈sub） |ORDER BY〈noun）〈sorting）〈sub> I LIMIT NUM <nouns）:=DISTINCT（nouns</td></tr><tr><td>I〈noun >,(nouns ) I(noun) I〈func）（〈noun）) （condition）:=〈noun）〈op）NUM I〈noun）〈op〉〈noun）</td></tr><tr><td>I(noun）〈op>〈sql> IBETWEEN〈noun）AND （noun） |（condition）AND（condition） |（condition）OR（condition） INOT〈condition)</td></tr><tr><td><sorting）:=ASC丨DESC|∈ （func）:= COUNT丨AVGIMAXIMINI SUM</td></tr><tr><td><op>:=>=|<=l>|<l=l!=</td></tr><tr><td><noun）:=STRINGISTRING.STRING|*</td></tr></table></body></html>  \\n\\nTable 6 shows a simplified version of the SQL grammar. In this grammar, italicized text with angle brackets, such as terminals which can be further expanded based $\\\\langle s q l\\\\rangle$ , represents nonon derivation rules. Text without brackets, such as the SELECT keyword, represents terminals that cannot be further expanded. 
Using the derivation rules in Table 6 , S TEPS decomposes a SQL query into 6 types of SQL clauses: (1) FROM-JOIN-ON ,(2) WHERE , (3) GROUP BY , (4) HAVING , (5) ORDER BY , (6) SELECT . We do not separate the JOIN clause from the FROM clause, since it is easier to translate them together. For nested queries with INTERSECT ,UNION ,EXCEPT ,NOT IN keywords, S TEPS first decomposes them into subqueries and then decompose each subquery to the 6 types of clauses above.  \\n\\nSTEPS translates each SQL clause to a natural language explanation based on translation rules and templates in Table 7 and Table 8 . Table 7 shows the translation rules for individual SQL tokens, e.g., keywords, operators, built-in functions, etc. Specifically, {col} and $\\\\{\\\\mathrm{T}\\\\}$ mean translating a column or table name to a more readable name. We pre-defined mapping between each table and column in a database to a more readable name. Such a mapping can be easily defined based on the database schema and only needs to be defined once. If no such mapping is available, S TEPS will reuse the same column/table name as defined in the database schema. Table 8 shows the translation templates for nested queries. 
The T RANSLATE function means recursively invoking the explanation generation method on the subquery.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454845728659891218, 'distance': 0.5735474824905396, 'entity': {'paper_id': '6576dccf939a5f40821c2429', 'paper_title': 'TrojanSQL: SQL Injection Against Natural Language Interface to Database', 'chunk_id': 0, 'chunk_text': '# TrojanSQL: SQL Injection against Natural Language Interface to Database\\nJinchuan Zhang 1 ,, Yan Zhou 1 ,, Binyuan Hui, Yaxin ${\\\\bf L i u^{1,2}}$ , Ziming ${\\\\bf L i}^{1,2}$ , Songlin $\\\\mathbf{H}\\\\mathbf{u}^{1,2}$ 1 Institute of Information Engineering, Chinese Academy of Sciences 2 School of Cyber Security, University of Chinese Academy of Sciences {zhangjinchuan, zhouyan, liuyaxin, liziming, husonglin}@iie.ac.cn\\n\\n# Abstract\\nThe technology of text-to-SQL has significantly enhanced the efficiency of accessing and manipulating databases. However, limited research has been conducted to study its vulnerabilities emerging from malicious user interaction. By proposing TrojanSQL, a backdoor-based SQL injection framework for text-to-SQL systems, we show how state-of-the-art text-to-SQL parsers can be easily misled to produce harmful SQL statements that can invalidate user queries or compromise sensitive information about the database. The study explores two specific injection attacks, namely boolean-based injection and union-based injection , which use different types of triggers to achieve distinct goals in compromising the parser. Experimental results demonstrate that both medium-sized models based on fine-tuning and LLM-based parsers using prompting techniques are vulnerable to this type of attack, with attack success rates as high as $99\\\\%$ and $89\\\\%$ , respectively. 
We hope that this study will raise more concerns about the potential security risks of building natural language interfaces to databases.\\n\\n# 1 Introduction\\nText-to-SQL, known as Natural Language Interface to Database (NLIDB), is designed to automatically convert user questions into executable SQL queries ( Zelle and Mooney ,1996 ;Li and Jagadish ,2014 ). It allows non-technical individuals to access the database without grasping SQL grammar or database details. As a result, this technology has given rise to a plethora of applications ( Lee et al. ,2022 ;Joseph et al. ,2022 ;Borges et al. ,2020 ).  \\n\\nHowever, limited research has been conducted to investigate the security aspects of natural language interfaces to databases despite the fact that database security is crucial for protecting sensitive information and preserving data integrity. To bridge this gap, we introduce the notion of SQL injection in the context of NLIDB. We define the action of inserting malicious text with the goal of misleading a text-to-SQL parser to generate harmful SQL statements as SQL injection against NLIDB . Nevertheless, how to implement such attacks remains an open question. In traditional web-based SQL injection (Figure 1(a) ), the attacker inserts malicious SQL statements (also known as payload ) into an input field by combining a guess for the back-end database query statement. An intuitive approach to performing SQL injection against NLIDB would be to follow the web-based injection and insert the payload directly into the user’s question to try to generate it as is, but this would be very conspicuous 1 and thus easily detected and filtered.  \\n\\n  \\nFigure 1: (a) Web-based SQL injecction . The attacker invalidates the password condition by typing \"’admin - -\" into the username field, where \"’\" closes the SQL statement and \"- -\" comments out the following content. (b) SQL injection against NLIDB . 
The attacker injects a backdoor into the text-to-SQL parser by poisoning the training data or prompt and then interacts with it to trigger the payload generation.  \\n\\nIn practice, training a fine-tuned parser typically involves data collection and model training. Data collection often relies on third-party data suppliers 2 or public datasets 3 from the web for annotation or data augmentation, considering the resource-intensive nature of manual annotation. Alternatively, developers may download pre-trained weights from public websites 4 to minimize training costs. However, this lack of control over the training process creates opportunities for adversaries to introduce backdoors into the models. For instance, adversaries can upload poisoned datasets or model weights to public websites, exploiting the insufficient safeguards in place.  \\n\\nThe emergence of powerful large language models (LLMs) has recently enabled the development of highly effective parsers with minimal demostration examples ( Chen et al. ,2023 ), indicating the potential for LLM-based parsers to serve as novel interfaces for databases ( Li et al. ,2023 ). Nevertheless, the exponential growth of LLM-based applications coupled with inadequate regulation creates an environment in which certain malicious service providers (MSPs) could exploit the invisibility of the prompt engineering process to offer users services that contain hidden backdoors.  \\n\\nBased on the characteristics of current text-toSQL parsers, we have developed a framework, TrojanSQL, to perform SQL injection on NLIDBs by data poisoning. It aims to include a hidden mapping for trigger to payload in the parser (Figure 1(b) ), which we refer to as the model’s backdoor. We implement TrojanSQL with two specific injection methods: boolean-based injection and union-based injection . 
The payloads of both injection methods are dynamically constructed from user questions and database schema, which makes it difficult for both humans and database engines to distinguish whether they are injection statements or normal requests. Thus, it is difficult to filter these payloads by simple heuristic rules. Additionally, we propose a sketch-based editing strategy to ensure that the entire statement is syntactically complete after the payload is inserted into the original SQL.  \\n\\nOverall, our contributions are as follows:  \\n\\n•To the best of our knowledge, we are the first to point out that NLIDB is at risk of being injected like web applications, and propose definitions and principles of SQL injection against NLIDB. Based on these principles, we designed a specific framework, TrojanSQL.  \\n\\n•We conducted extensive experiments and tested certain factors that affect the effectiveness of the attack. Experimental results show that only a small number of poisoned samples are needed to achieve a high attack success rate for both finetuning-based and LLM-based parsers.  \\n\\n•We attempted to defend against TrojanSQL by filtering poisoned samples, but found it difficult to remove them effectively. This reveals the potential of our framework as a way to build a red-teaming approach ( Ganguli et al. ,2022 ) for LLM in code scenarios to fill the gap of open-source red-teaming datasets for code generation 6 .', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454845641342337844, 'distance': 0.572297215461731, 'entity': {'paper_id': '6461b9c9d68f896efad43133', 'paper_title': 'Interactive Text-to-SQL Generation Via Editable Step-by-Step Explanations', 'chunk_id': 0, 'chunk_text': '# Interactive Text-to-SQL Generation via Editable Step-by-Step Explanations\\nYuan Tian 1 , Zheng Zhang 2 , Zheng $\\\\mathbf{Ning^{2}}$ ,Toby Jia-Jun $\\\\mathbf{Li}^{2}$ ,Jonathan K. 
Kummerfeld 3 , and Tianyi Zhang 1 Purdue University 1 , University of Notre Dame 2 , The University of Sydney 3  , , , , ,\\n\\n# Abstract\\nRelational databases play an important role in business, science, and more. However, many users cannot fully unleash the analytical power of relational databases, because they are not familiar with database languages such as SQL. Many techniques have been proposed to automatically generate SQL from natural language, but they suffer from two issues: (1) they still make many mistakes, particularly for complex queries, and (2) they do not provide a flexible way for non-expert users to validate and refine incorrect queries. To address these issues, we introduce a new interaction mechanism that allows users to directly edit a stepby-step explanation of a query to fix errors. Our experiments on multiple datasets, as well as a user study with 24 participants, demonstrate that our approach can achieve better performance than multiple SOTA approaches. Our code and datasets are available at https: //github.com/magic-YuanTian/STEPS .\\n\\n# 1 Introduction\\nNatural language interfaces significantly lower the barrier to accessing databases and performing data analytics tasks for users who are not familiar with database query languages. Many approaches have been proposed for generating SQL queries from natural language ( Popescu et al. ,2004 ;Giordani and Moschitti ,2012 ;Rubin and Berant ,2021 ;Scholak et al. ,2021 ;Zhao et al. ,2022 ). Using recent large language models, systems have reached $86.6\\\\%$ execution accuracy ( Gao et al. ,2023 ) on the Spider benchmark ( Yu et al. ,2018 ).  \\n\\nHowever, the rate of improvement has slowed, with a gain of only $10\\\\%$ since mid-2021. This is partly due to the inherent ambiguity of natural language and the complex structure of SQL queries (e.g., nested or joined queries). Thus, it is challenging to generate a fully correct query in one step, especially for complex tasks ( Yao et al. 
,2019 ).  \\n\\n  \\nFigure 1: Refining a SQL query by directly editing a step-by-step explanation.  \\n\\nThere has been growing interest in developing “human-in-the-loop” approaches that elicit user feedback to guide SQL generation. However, most approaches only support feedback in constrained forms, e.g., answering multiple-choice questions (MISP, PIIA, DialSQL Yao et al. ,2019 ;Li et al. ,2020 ;Gur et al. ,2018 ), changing SQL elements in a drop-down menu (DIY, Narechania et al. ,2021 ), etc. Such constrained feedback is not sufficient to fix many complex errors in real-world SQL tasks. One exception is NL-EDIT ( Elgohary et al. ,2021 ), which allows users to provide feedback as new utterances. However, since the feedback is open-ended, interpreting it can be just as hard as processing the original request.  \\n\\nIn this paper, we seek to strike a balance between constrained feedback and open-ended feedback by proposing a new interaction mechanism: editable step-by-step explanations. Fig. 1 illustrates our idea. This mechanism consists of three core components: (a) a text-to-SQL model, (b) an explanation generation method, and (c) a SQL correction model. Our key insight is that using a step-by-step explanation as the basis to suggest fixes allows users to precisely specify where the error is and how to fix it via direct edits. This not only saves users’ time but also makes it easier for the model to locate the error and apply fixes.  \\n\\nBased on this idea, we implemented an interactive SQL generation and refinement system called STEPS . S TEPS adopts a rule-based method to generate step-by-step explanations and uses a hybrid rule/neural method to convert a user-corrected explanation back to a SQL query.  \\n\\nAn evaluation with a simulated user on Spider ( Yu et al. 
,2018 ) shows that S TEPS can achieve $97.9\\\\%$ exact set match accuracy, outperforming prior interactive text-to-SQL systems— MISP, DIY, and NL-EDIT—by $33.5\\\\%$ ,$33.2\\\\%$ , and $31.3\\\\%$ respectively. We further evaluate S TEPS on other datasets, including Spider-DK ( Gan et al. ,2021b ), Spider-Syn ( Gan et al. ,2021a ), and WikiSQL ( Zhong et al. ,2017 ). S TEPS consistently achieves at least $96\\\\%$ exact set match accuracy and execution accuracy across all datasets.  \\n\\nFinally, we conducted a within-subjects user study with 24 real users. We found that within the same amount of time, S TEPS helped users complete almost 2X and 4X more tasks correctly than DIY and MISP respectively, with significantly higher self-reported confidence and lower mental load.  \\n\\nThis work makes the following contributions: (1) we propose a new interaction mechanism for the text-to-SQL task; (2) we develop an interactive text-to-SQL system based on the new interaction mechanism and a new training method for SQL correction; (3) we conduct a comprehensive evaluation with both simulated and real users and demonstrate its effectiveness over state-of-the-art interactive systems. Our dataset and code are publicly available.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454845681927213884, 'distance': 0.567534863948822, 'entity': {'paper_id': '6535d747939a5f408295c649', 'paper_title': 'Benchmarking and Improving Text-to-SQL Generation under Ambiguity', 'chunk_id': 8, 'chunk_text': '# T5-3B\\n1. SELECT t1.file_name FROM documents AS t1 JOIN templates AS t2 ON t1.template_id $=$ t2.template_id WHERE t2.template_type_code $=$ “BK”   \\n2. SELECT t1.file_name, t1.document_description FROM documents AS t1 JOIN templates AS t2 ON t1.template_id $=12$ .template_id WHERE t2.template_type_code $=$ “BK”   \\n3. 
SELECT file_name FROM documents AS t1 JOIN templates AS $^{\\\\dag2}$ ON t1.template_id $\\\\l_{\\\\l}=\\\\mathrm{t}2$ .template_id WHERE t2.template_type_code $=$ “BK”   \\n4. SELECT t1.file_name, t1.record_name FROM documents AS t1 JOIN templates AS t2 ON t1.template_id $=$ t2.template_id WHERE t2.template_type_code $=$ “BK”   \\n5. SELECT t1.document_name FROM documents AS t1 JOIN templates AS t2 ON t1.template_id $=12$ .template_id WHERE t2.template_type_code $=$ “BK”  \\n\\n1. SELECT song_name, song_release_year FROM artist ORDER BY age LIMIT 1   \\n2. SELECT song_name, song_release_year FROM performer ORDER BY age LIMIT 1\\n\\n# LogicalBeam\\n1. SELECT song_name, song_release_year FROM artist ORDER BY age LIMIT 1   \\n2. SELECT t2.song_name, t2.song_release_year FROM artist AS t1 JOIN song_name AS t2 ON t1.song_name $=$   \\nt2.song_name ORDER BY t1.age LIMIT 1   \\n3. SELECT song_name, song_release_year FROM performer ORDER BY age LIMIT 1   \\n4. SELECT song_name, song_release_year FROM performer ORDER BY age asc LIMIT 1   \\n5. SELECT song_name, song_release_year FROM concert ORDER BY age LIMIT 1\\n\\n# ChatGPT\\n1. SELECT name, song_release_year FROM performer WHERE age $=$ (SELECT MIN(age) FROM performer) LIMIT 1   \\n2. SELECT name, song_release_year FROM artist WHERE age $=$ (SELECT MIN(age) FROM artist) LIMIT 1   \\n3. SELECT t1.name, t1.song_release_year FROM performer AS t1 JOIN singer_in_concert AS t2 ON t1.singer_id  $=12$ .singer_id WHERE t1.age $=$ (SELECT MIN(age) FROM performer) LIMIT 1  \\n4. SELECT t1.name, t1.song_release_year FROM artist AS t1 JOIN singer_in_concert AS t2 ON t1.singer_id $=$ t2.singer_id WHERE t1.age $=$ (SELECT MIN(age) FROM artist) LIMIT 1   \\n5. SELECT t1.name, t1.song_release_year FROM performer AS t1 JOIN singer_in_concert AS t2 ON t1.singer_id $=$ t2.singer_id WHERE t1.age $=$ (SELECT MIN(age) FROM (SELECT \\\\* FROM performer UNION SELECT \\\\* FROM artist)) LIMIT 1\\n\\n# T5-3B\\n1. 
SELECT song_name, song_release_year FROM artist ORDER BY age LIMIT 1   \\n2. SELECT t2.song_name, t2.song_release_year FROM artist AS t1 JOIN song_name AS t2 ON t1.song_name $=$ t2.song_name ORDER BY t1.age LIMIT 1   \\n3. SELECT t2.song_name, t2.song_release_year FROM artist AS t1 JOIN song_name AS t2 ON t1.song_name $=$ t2.song_name WHERE t1.age $=$ (SELECT MIN(age) FROM artist)   \\n4. SELECT song_name, song_release_year FROM artist ORDER BY age asc LIMIT 1   \\n5. SELECT t2.song_name, t2.song_release_year FROM artist AS t1 JOIN performer AS t2 ON t1.singer_id $=$ t2.singer_id ORDER BY t1.age LIMIT 1\\n\\n# (J)oin Ambiguity\\n1. SELECT name, location, district FROM shop ORDER BY number_products DESC 2. SELECT t1.name, t1.location, t2.district FROM shop AS t1 JOIN shop_district AS t2 ON t1.shop_id $=12$ .shop_id ORDER BY t1.number_products DESC\\n\\n# LogicalBeam\\n1. SELECT name, location, district FROM shop ORDER BY number_products DESC   \\n2. SELECT name, location and district FROM shop ORDER BY number_products DESC   \\n3. SELECT name, district, location FROM shop ORDER BY number_products DESC   \\n4. SELECT name, district FROM shop ORDER BY number_products DESC   \\n5. SELECT t1.name, t1.location, t2.district FROM shop AS t1 JOIN shop_district AS t2 ON t1.shop_id $=$ t2.shop_id   \\nORDER BY t1.number_products DESC\\n\\n# ChatGPT\\n1. SELECT name, location, district FROM shop JOIN shop_district ON shop.shop_id $=$ shop_district.shop_id ORDER BY number_products DESC   \\n2. SELECT name, location, district FROM shop_district JOIN shop ON shop.shop_id $=$ shop_district.shop_id ORDER BY number_products DESC   \\n3. SELECT s.name, s.location, sd.district FROM shop AS s JOIN shop_district AS sd ON s.shop_id $=$ sd.shop_id JOIN shop AS s2 ON s2.shop_id $=$ sd.shop_id GROUP BY s.name, sd.district, s.location ORDER BY sum(s2.number_products) DESC   \\n4. 
SELECT s.name, s.location, sd.district FROM shop_district AS sd JOIN shop AS s ON s.shop_id $=$ sd.shop_id JOIN shop AS s2 ON s2.shop_id $=$ sd.shop_id GROUP BY s.name, sd.district, s.location ORDER BY sum(s2.number_products) DESC   \\n5. SELECT name, location, district FROM shop_district JOIN shop ON shop.shop_id $=$ shop_district.shop_id ORDER BY (SELECT count $^{*})$ FROM employee WHERE employee.city $=$ shop.location) DESC\\n\\n# T5-3B\\n1. SELECT name, location, district FROM shop ORDER BY number_products DESC   \\n2. SELECT name, location and district FROM shop ORDER BY number_products DESC   \\n3. SELECT name, location, district FROM shop ORDER BY number_products DESC   \\n4. SELECT name, location, district form shop ORDER BY number_products DESC   \\n5. SELECT name, location, district of shop ORDER BY number_products DESC   \\n1. SELECT AVG(capacity), MAX(capacity) FROM stadium   \\n2. SELECT avg_capacity, max_capacity FROM stadium_capacity\\n\\n# LogicalBeam\\n1. SELECT avg_capacity, max_capacity FROM stadium_capacity   \\n2. SELECT avg_capacity (SELECT AVG(capacity) FROM stadium_capacity) and max_capacity (SELECT   \\nMAX(capacity) FROM stadium)   \\n3. SELECT AVG(capacity), MAX(capacity) FROM stadium   \\n4. SELECT AVG(capacity), MAX(max_capacity) FROM stadium_capacity   \\n5. SELECT AVG(capacity), MAX(capacity), stadium_id FROM stadium_capacity\\n\\n# ChatGPT\\n1. SELECT AVG(capacity), MAX(capacity) FROM stadium   \\n2. SELECT AVG(t1.capacity), MAX(t1.capacity) FROM stadium AS t1   \\n3. SELECT AVG(t1.capacity), t1.max_capacity FROM stadium_capacity AS t1   \\n4. SELECT AVG(t1.capacity), MAX(t2.capacity) FROM stadium AS t1 JOIN stadium AS t2 on t1.capacity $<=$ t2.capacity group by t1.capacity   \\n5. SELECT AVG(t2.capacity), MAX(t2.capacity) FROM concert AS t1 JOIN stadium AS t2 on t1.stadium_id $=$ t2.stadium_id\\n\\n# T5-3B\\n1. SELECT avg_capacity, max_capacity FROM stadium_capacity   \\n2. 
SELECT avg_capacity (SELECT AVG(capacity) FROM stadium_capacity) and max_capacity (SELECT MAX(capacity) FROM stadium)   \\n3. SELECT avg_capacity (SELECT AVG(capacity) FROM stadium_capacity) and max_capacity (SELECT max_capacity FROM stadium_capacity) FROM stadium   \\n4. SELECT AVG(capacity), MAX(capacity) FROM stadium_capacity   \\n5. SELECT avg_capacity (SELECT AVG(capacity) FROM stadium_capacity) and max_capacity (SELECT MAX(capacity) FROM stadium_capacity)', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}}, {'id': 454919307609527472, 'distance': 0.5674996376037598, 'entity': {'paper_id': '63608e5090e50fcafdee1152', 'paper_title': 'Diverse Parallel Data Synthesis for Cross-Database Adaptation of   Text-to-SQL Parsers', 'chunk_id': 3, 'chunk_text': '# 2.3 Filtering the Generated Text\\nSince the data synthesized using R EFILL is used to fine-tune a downstream Text-to-SQL parser, we learn a Filtering model inconsistent examples from the generated dataset. $\\\\mathcal{F}:(\\\\mathcal{X},\\\\mathcal{Q})\\\\mapsto\\\\mathbb{R}$ to discard $\\\\mathcal{F}$ assigns lower scores to inconsistent Text-SQL pairs. For each SQL $q\\\\in\\\\mathcal{Q}\\\\mathcal{W}_{s}$ , we select the top-5 sentences generated by R EFILL and discard all the sentences that are scored below a fixed threshold as per the filtering model. Existing work depended on a trained Text-to-SQL parser consistency scores ( Zhong et al. $\\\\mathcal{M}$ ,2020 to assign cycle). However, we show that cycle-consistency filtering favors text on which not result in a useful dataset for fine-tuning $\\\\mathcal{M}$ already performs well, and henc M.oes $\\\\{(x_{i},q_{i})\\\\}$ classifier, independent of itive (consistent) examples and we synthetically We instead train a filte in the training set M. 
The Text-SQL pairs model $\\\\ensuremath{\\\\mathcal{D}}_{\\\\mathrm{train}}$ , serve as pos$\\\\mathcal{F}$ as a binary generate the negative (inconsistent) examples as follows: (i) Replace DB values in the ${\\\\mathrm{SQL~}}q_{i}$ with arbitrary values sampled from the same column of the database. (ii) Replace SQL-specific tokens in $q_{i}$ with their corresponding alternates e.g. replace ASC with DESC , or $\\\\bullet\\\\,\\\\neg\\\\,\\\\ \\'$ with $\\\\acute{\\\\bullet}<\\\\ \\'$ . (iii) Cascade previous two perturbations. (iv) Replace the entire SQL $q_{i}$ with a randomly chosen SQL $q_{j}$ from the same schema. (v) Randomly drop tokens in the text query $x_{i}$ with a fixed probability of 0.3. (vi) Shuffle a span of tokens in the text query $x_{i}$ , with span length set to $30\\\\%$ of the length of $x_{i}$ . Thus, for a given Text-SQL pair $(x_{i},q_{i})$ we obtain six corresponding negative pairs $\\\\{(x_{j}^{n},q_{j}^{n})\\\\}_{j=1}^{6}$ . Let $s_{i}$ be the score provided by the filtering model for the original pair $(x_{i},q_{i})$ and $\\\\{s_{j}\\\\}_{j=1}^{6}$ be the scores assigned to the corresponding negative pairs supervise the scores from the filtering model us$\\\\{(x_{j}^{n},q_{j}^{n})\\\\}_{j=1}^{6}$ . We ing a binary-cross-entropy loss over the Sigmoid activations of scores as in Equation 1 .  \\n\\n$$\\n\\\\mathcal{L}_{\\\\mathrm{bce}}=-\\\\log\\\\sigma(s_{i})-\\\\sum_{j=1}^{6}\\\\log\\\\sigma(1-s_{j})\\n$$  \\n\\nTo explicitly contrast an original pair with its corresponding negative pairs we further add another Softmax-Cross-Entropy loss term.  \\n\\n$$\\n\\\\mathcal{L}_{\\\\mathrm{xent}}=-\\\\log\\\\frac{\\\\exp(s_{i})}{\\\\exp(s_{i})+\\\\sum_{j=1}^{6}\\\\exp(s_{j})}\\n$$\\n\\n# 3 Related Work\\nSQL-to-Text generation Many prior works perform training data augmentation via pre-trained text generation models that translate SQLs into natural text ( Guo et al. ,2018 ;Zhong et al. ,2020 ;Shi et al. ,2020 ;Zhang et al. ,2021 ;Wang et al. 
,2021 ;Yang et al. ,2021 ;Shu et al. ,2021 ). For example, Wang et al. (2021 ) fine-tune BART ( Lewis et al. ,2020 ) on parallel SQL-Text pairs to learn an SQLto-Text translation model. Shu et al. (2021 ) propose a similar model that is trained in an iterativeadversarial way along with an evaluator model. The evaluator learns to identify inconsistent SQL-Text pairs, similar to our filtering model. To retain high quality synthesized data Zhong et al. (2020 ) additionally filter out the synthesized pairs using a pre-trained Text-to-SQL model based on cycle consistency, that we show to be sub-optimal $\\\\left(\\\\S\\\\ 5.5\\\\right)$ .The SQL workload in most of the prior work was typically sampled from hand-crafted templates or a grammar like PCFG induced from existing SQLs, or crawling SQLs from open-source repositories Shi et al. (2020 ). However, database practitioners have recently drawn attention to the fact that SQL workloads are often pre-existing and should be utilized ( Baik et al. ,2019 ).  \\n\\nRetrieve and Edit Methods Our method is related to the retrieve-and-edit framework, which has been previously applied in various NLP tasks. In Semantic Parsing, question and logical-form pairs from the training data relevant to the test-input question are retrieved and edited to generate the output logical forms in different ways ( Shaw et al. ,2018 ;Das et al. ,2021 ;Pasupat et al. ,2021 ;Gupta et al. ,2021 ). In machine translation, memory augmentation methods retrieve-and-edit examples from translation memory to guide the decoder’s output ( Hossain et al. ,2020 ;Cai et al. ,2021 ). Our editing step — masking followed by refilling is similar to style transfer methods that minimally modify the input sentence with help of retrieved examples corresponding to a target attribute ( Li et al. ,2018 ). 
In contrast to learning a retriever, we find simple tree-edit distance to be an effective metric for retrieving the relevant examples for our task.\\n\\n# 4 Experimental Set-up 1\\nWe adapt pretrained Text-to-SQL parsers on multiple database schemas unseen during the train time. Here, we describe the datasets, models, and evaluation metrics used in our experiments.  \\n\\nDatasets : We primarily experiment with the Spider dataset ( Yu et al. ,2018 ). Spider’s train split contains 7000 Text-to-SQL examples spanning 140 database schemas, and the dev split contains roughly 1000 examples spanning 20 schemas 2 .Since individual schemas in the dev split typically contain less than 50 examples, to evaluate on a larger set of examples we adapt and evaluate the Text-to-SQL parser on groups of similar schemas instead of individual schemas. We create 4 groups, with each group having database schemas from a similar topic. For example, Group-1 consists of databases {Singer, Orchestra, Concerts} .We utilize all the available Text-SQL pairs in each group for evaluation. In appendix Table A.1 , we provide detailed statistics about each group. On average, each group contains 69 unique SQLs and 131 evaluation examples. To simulate a query workload $\\\\mathcal{Q}\\\\mathcal{W}_{s}$ for each group, we randomly select $70\\\\%$ of the available SQLs and replace the constant-values in the SQLs with values sampled from their corresponding column in the database. We also evaluate on query workloads of size $30\\\\%$ and $50\\\\%$ of the available SQL queries. The SQL queries in the workload are translated using R EFILL or an SQLto-Text model, and the resulting Text-SQL pairs are then used to fine-tune a base Text-to-SQL parser.  
\\n\\n<html><body><table><tr><td></td><td colspan=\"2\">Group 1</td><td colspan=\"2\">Group 2</td><td colspan=\"2\">Group 3</td><td colspan=\"2\">Group 4</td><td colspan=\"2\">Average</td></tr><tr><td>Method</td><td>EM</td><td>EX</td><td>EM</td><td>EX</td><td>EM</td><td>EX</td><td>EM</td><td>EX</td><td>EM</td><td>EX</td></tr><tr><td>BASE-M</td><td>80.9</td><td>84.3</td><td>64.8</td><td>67.2</td><td>64.0</td><td>65.9</td><td>45.8</td><td>35.8</td><td>63.8</td><td>63.3</td></tr><tr><td>L2S (Wang et al., 2021)</td><td>88.7</td><td>87.8</td><td>61.3 58.9</td><td>62.1</td><td>62.8</td><td>61.0</td><td>42.5</td><td>35.0</td><td>63.8</td><td>61.4</td></tr><tr><td>GAZP (Zhong et al., 2020)</td><td>85.2</td><td>85.2 87.8</td><td>59.7</td><td>66.9 60.5</td><td>70.1 64.0</td><td>60.5</td><td>52.5</td><td>40.8</td><td>66.6</td><td>63.3</td></tr><tr><td>SNOWBALL (Shu et al.,2021)</td><td>85.2</td><td></td><td></td><td></td><td></td><td>65.9</td><td>44.2</td><td>38.3</td><td>63.2</td><td>63.1</td></tr><tr><td>REFILL (Ours)</td><td>88.7</td><td>87.0</td><td>69.7</td><td>73.8</td><td>73.2</td><td>70.1</td><td>55.8</td><td>45.0</td><td>71.8</td><td>68.9</td></tr></table></body></html>\\n\\nTable 1: Results for finetuning a base semantic parser (S MBOP) on Text-SQL pairs generated using various SQLto-Text baselines and R EFILL $(\\\\S\\\\,5.1)$ . R EFILL provides consistent gains over the base model across all the database groups, while gains from other methods are often negative or small.  \\n\\nWe further experiment with four datasets outside Spider in Section 5.6 . We work with GeoQuery ( Zelle and Mooney ,1996 ), Academic ( Li and Jagadish ,2014 ), IMDB and Yelp ( Navid Yaghmazadeh and Dillig ,2017 ). We utilize the preprocessed version of these datasets open-sourced by Yu et al. (2018 ). In appendix Table A2 , we present statistics about each of the four datasets.  
\\n\\nText-to-SQL parser : We experiment with S M-BOP ( Rubin and Berant ,2021 ) as our base Textto-SQL parser, and utilize author’s implementation. The S MBOP model is initialized with a R OBERT A -BASE model, followed by four RAT layers, and trained on the train split of Spider dataset. The dev set used used for selecting the best model excludes data from the four held-out evaluation groups.  \\n\\nEdit and Fill model : We utilize a pre-trained BARTBASE as our conditional text generation model for editing and filling the masked text. The model is fine-tuned using the train split of Spider dataset as described in Section 2.2  \\n\\nFiltering Model : We train a binary classifier based on a ROBERTABASE checkpoint on Spider’s train split to filter out inconsistent SQL-Text pairs as described in Section 2.3 .  \\n\\nBaselines : For baseline SQL-to-Text generation models, we consider recently proposed models like L2S ( Wang et al. ,2021 ), GAZP ( Zhong et al. ,2020 ), and S NOW BALL (Shu et al. ,2021 ). All the baselines utilize pre-trained language models like BART ( Lewis et al. ,2020 ) or BERT ( Devlin et al. ,2018 ) for translating SQL tokens to natural text in a standard seq-to-seq set-up. The baselines mostly differ in the way of encoding SQL tokens as an input to the language model. In Section 3 , we reviewed the recent SQL-to-Text methods.  \\n\\nEvaluation Metrics We evaluate the Text-to-SQL parsers using the Exact Set Match (EM), and the Exection Accuracy (EX) Yu et al. (2018 ). The EM metric measures set match for all the SQL clauses and returns 1 if there is a match across all the clauses. It ignores the DB-values (constants) in the SQL query. The EX metric directly compares the results obtained by executing the predicted query $\\\\hat{q}$ and the gold query $q$ on the database.  
\\n\\nWe provide more implementation details including the hyperparameter settings in appendix A.6 .', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2022_Empirical_Methods_in_Natural_Language_Processing_with_whole_text.db'}}]\n"
     ]
    }
   ],
   "source": [
    "# Test search_papers API\n",
    "print(\"\\nTesting /search_papers API...\")\n",
    "query_params = {\"query\": \"Text2SQL\", \"top_k\": 10}\n",
    "data = call_api(\"search_papers\", params=query_params)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "19ef46c5-0bd1-4f09-a30a-1dd36bd88e2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Testing /query_by_paper_id API...\n",
      "[SUCCESS] query_by_paper_id - Response:\n",
      "[{'id': 454846633242875330, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 0, 'chunk_text': '# Constraint Reasoning Embedded Structured Prediction\\nNan Jiang   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Maosen Zhang ByteDance Beijing, China.  \\n\\n Willem-Jan van Hoeve Tepper School of Business Carnegie Mellon University Pittsburgh, Pennsylvania, USA.  \\n\\n  \\n\\nYexiang Xue   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Editor: Maya Gupta\\n\\n# Abstract\\nMany real-world structured prediction problems need machine learning to capture data distribution and constraint reasoning to ensure structure validity. Nevertheless, constrained structured prediction is still limited in real-world applications because of the lack of tools to bridge constraint satisfaction and machine learning. In this paper, we propose CO nstraint RE asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over structured domains. We propose to embed decision diagrams, a popular constraint reasoning tool, as a fullydifferentiable module into deep neural networks for structured prediction. We also propose an iterative search algorithm to automate the searching process of the best Core-Sp structure. We evaluate Core-Sp on three applications: vehicle dispatching service planning, if-then program synthesis, and text2SQL generation. The proposed Core-Sp module demonstrates superior performance over state-of-the-art approaches in all three applications. The structures generated with Core-Sp satisfy 100% of the constraints when using exact decision diagrams. In addition, Core-Sp boosts learning performance by reducing the modeling space via constraint satisfaction.  
\\n\\nKeywords: Constraint Reasoning, Decision Diagrams, Structured Prediction.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633277740484, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 1, 'chunk_text': '# 1. Introduction\\nThe emergence of large-scale constraint reasoning and machine learning technologies have impacted virtually all application domains, including marketing, linguistics, operations, retail, robotics, and health care. Constraint reasoning has traditionally been applied to building prescriptive models that generate solutions for strategic, tactical, or operational use (Choi et al., 2012). It requires a precise problem description and is usually difficult to be made flexible to the evolving data distributions. Machine learning, on the other hand, has been applied primarily to build predictive models, such as classifications or regressions (Michalski and Anderson, 1984; Bishop, 2007). While the structure of a machine learning model (like a neural network) must be designed, the actual model parameters are learned automatically via gradient descent algorithms. This gives machine learning models the flexibility to adapt to the evolving data distributions. Nevertheless, it is difficult to enforce constraints on the output of machine learning models. Many real-world applications are beyond the reach of constraint reasoning or machine learning alone.  \\n\\n  \\nFigure 1: (a) Our proposed Core-Sp framework embeds constraint reasoning in machine learning for structured prediction. We demonstrate the effectiveness of Core-Sp on vehicle dispatching service, if-then program synthesis, and Text2SQL generation tasks. (b) At a high level, Core-Sp (in orange colored box) is a fully differentiable layer that simulates a path descending in the corresponding decision diagram. 
Core-Sp filters out the infeasible output from the structured output to ensure constraint satisfaction.  \\n\\nIn this paper, we focus on structured prediction problems, which is a class of learning problems requiring both constraint reasoning and machine learning. It expands the output space of classification problems into high-dimensional structured space. Structured prediction has diverse application domains, ranging from natural language processing (Socher et al., 2013), social network analysis (Xiang and Neville, 2013), and ecological modeling (Tang et al., 2018; Chen et al., 2018). The applications we consider in this paper all require tight integration of constraint reasoning and machine learning. Our first application vehicle dispatching service planning is to recommend a route that satisfies the daily service needs as well as meeting the drivers’ preferences. Historical data may reveal that the drivers do not follow common stylized objectives such as minimizing distance or time. Therefore standard constraint reasoning tools, e.g. , solvers for the traveling salesman problem, cannot be applied. While we need machine learning to capture the drivers’ objective functions, pure machine learning-based approaches are insufficient because they often generate routes that violate delivery requests. Our second and third applications are program synthesis from natural language , which clearly requires machine learning to generate structured programs. Nevertheless, a pure learning approach cannot enforce the syntactic and semantic rules of those programs.  \\n\\nWe propose Co nstraint Re asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over the structured domains. The main idea is to augment structured predictive models with a constraint reasoning module that represents physical and operational requirements. 
Specifically, we propose to embed decision diagrams (Akers, 1978; Bryant, 1986), a popular constraint reasoning tool, as a fully-differentiable module into deep neural networks. A decision diagram is a compact graphical representation of the constraints. It encodes each solution (an assignment of values to variables satisfying the constraints) as a path from the root to the terminal in the diagram. Core-Sp regards the neural network predictions as the simulation of descending along a path in the decision diagram. To ensure constraint satisfaction, Core-Sp filters out variable assignments from the neural network predictions that violate constraints. With the integration of Core-Sp , we provide structured prediction models with constraint satisfaction assurances. Moreover, structured prediction models with the Core-Sp layer enjoy a smaller prediction space than traditional structured prediction approaches, allowing our approach to learn faster in training and generalize better in testing. See Figure 1(a) for our proposed Core-Sp model which integrates constraint reasoning and machine learning for the three application domains. The high-level idea of Core-Sp is illustrated in Figure 1(b).  \\n\\nPrevious approaches have considered regularizing machine learning with constraint reasoning in various application domains. Within the broader context of learning constrained models, the work of Coletta et al. (2003); Lallouet et al. (2010); Beldiceanu and Simonis (2012); Bessiere et al. (2017); Addi et al. (2018) have studied automating the constraint acquisition process from historic data or (user-)generated queries. These approaches use partial or complete examples to identify the constraints that can be added to the model. The type of constraints that can be learned depends on the formulation. 
Several works (Punyakanok et al., 2004; Roth and Yih, 2005; Amos and Kolter, 2017; Ferber et al., 2020) enable learning in a constrained domain via encoding mathematical programming, such as quadratic programming or mixed integer linear programming, as a neural network layer. Deutsch et al. (2019) propose to formulate the output space as an automata. They use the constraints to prune all the invalid transitions in the automata to ensure the validity of the structured outputs. In addition, constraints imposed by a knowledge graph have been embedded into the neural network as differentiable layers (Peters et al., 2019; Wu et al., 2017). Zeng et al. (2021) and Heim (2019) enforce physical constraints or expert inputs as soft constraints. We will illustrate the difference between our approach and these methods in Section 3.2. A different approach is to embed a machine learning model into optimization, e.g. , by extending a constraint system with appropriate global constraints. For example, Lallouet and Legtchenko (2007) integrate neural networks and decision trees with constraint programming, while Lombardi et al. (2017) and Lombardi and Gualandi (2016) introduce a “Neuron” global constraint that represents a pre-trained neural network. Another series of approaches based on grammar variational autoencoders (Kusner et al., 2017; Dai et al., 2018; Jin et al., 2018) use neural networks to encode and decode from the parse-tree of a context-free grammar to generate discrete structures. Such approaches are used to generate chemical molecule expressions, which represent a structured domain. Machine learning approaches have also been used to solve constraint reasoning and optimization problems. This includes the works of Galassi et al. (2018) and Vinyals et al. (2015), which use neural networks to extend partial solutions to complete ones. Bello et al. (2017) handle the traveling salesman problem by framing it as reinforcement learning. Selsam et al. 
(2019) proposes to learn an SAT solver from single-bit supervision. Approaches based on neural Turing machines (Graves et al., 2016) employ neural networks with external memory for discrete structure generation. More recently, Khalil et al. (2017) tackle the combinatorial optimization problems in graphs, by employing neural networks to learn the heuristics in the backtrack-free search. There is also a recent trend to synthesize programs using machine learning (Guu et al., 2017; Shi et al., 2019).  \\n\\nIn experimental analysis, we demonstrate the effectiveness of Core-Sp on the following three applications: (1) Vehicle Dispatching Service Planning : a route planning problem that recommends routes to drivers to meet the service needs while satisfying the drivers’ preferences. The implicit preferences of drivers are learned from the historical traveling data. The input of this problem is the daily service requests. The output is the permutations of the service locations, representing the sequential order that the locations should be visited by the drivers. This task requires machine learning models to capture drivers’ preferences from the traveling data, and constraint reasoning to ensure the satisfaction of service requests. (2) If-then Program Synthesis : the task is to automatically synthesize conditional programs from the natural language. Automatic program synthesis tools are useful to streamline the program of a few online services such as IFTTT and Zapier. The if-then program is in the form of: if trigger function happens in the trigger service , then take the action function from the action service . The machine learning task, therefore, is to predict the quadruple ( trigger service ,trigger function ,action service ,action function ). This application again requires machine learning to understand the semantics of the natural language, as well as constraint reasoning to satisfy the syntactic rules of the programs. 
(3) Text2SQL Generation : our last application is to automatically generate SQL queries that extract information from a database to answer a question posed in natural language. The neural model is used to understand the user’s queries in natural language while the constraint reasoning tool is applied to ensure the model generates grammaticallyvalid SQL queries.  \\n\\nOur proposed Core-Sp framework demonstrates superior performance against the stateof-the-art approaches in all three applications. First, the structures generated by Core-Sp are better in constraint satisfaction. In vehicle service dispatching, all Core-Sp generated routes are valid, while a conditional generative adversarial network (cGAN) without CoreSp generates on average less than $1\\\\%$ of valid routes when handling medium-sized delivery requests. We also apply a post-processing step (Deudon et al., 2018) to boost cGAN’s performance, but it cannot handle the complexity brought by the large combinatorial space of the routing problem. Its performance quickly defaults to the case without post-processing as the number of delivery locations increases. For if-then program synthesis, the percentage of valid programs produced increased from 88% to 100% with the Core-Sp module incorporated into the state-of-the-art LatentAttention model (Liu et al., 2016). For Text2SQL, the percentage of valid SQL queries increased from 83 .7% to 100% with Core-Sp incorporated into the state-of-the-art SQLNova model (Hwang et al., 2019) on a hard testing set. Core-Sp also improves the learning performance of structured prediction models. We show that the routes generated by Core-Sp better fulfill drivers’ preferences than cGAN without Core-Sp . In if-then program synthesis, Core-Sp module leads to approximately $2.0\\\\%$ improvement in accuracy compared with the state-of-the-art LatentAttention model and converges to models with higher accuracy in fewer training epochs. 
In Text2SQL generation, the Core-Sp module improves around 4 .2% in execution accuracy and 1 .9% in logical accuracy against SQLNova on a challenging test set.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633313916358, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 2, 'chunk_text': \"# 2.1 Structured Prediction\\nStructured prediction expands the output space of classification problems into a highdimensional combinatorial space (Bakır et al., 2007). Specifically, given a set of inputoutput samples $\\\\mathcal{D}^{t r}\\\\,=\\\\,\\\\{({\\\\boldsymbol{x}}^{(i)},{\\\\boldsymbol{y}}^{(i)})\\\\}_{i=1}^{N}$ drawn i.i.d. from some unknown distribution over the space $\\\\mathcal X\\\\times\\\\mathcal Y$ , a structured pr tion model learns a conditional distribution $p_{\\\\theta}(y|x)$ ,for all $(x,y)\\\\,\\\\in\\\\,\\\\mathcal{X}\\\\times\\\\mathcal{Y}$ from data D$\\\\mathcal{D}^{t r}$ , where $\\\\theta$ denotes the parameters of the structured prediction model. Note that the output space ${\\\\boldsymbol{\\\\mathcal{D}}}=\\\\{0,1\\\\}^{\\\\iota}$ is a high dimensional space of combinatorial structures. The three applications we consider in this paper are all structured prediction problems. In vehicle dispatching service planning, the structured outputs are the delivery routes on a map. In if-then program synthesis, the structured outputs are the programs that complete web-service tasks. In Text2SQL generation, the structured outputs are the SQL queries that follow the SQL grammar.  \\n\\nIn the literature, various approaches have been proposed for structured prediction problems. The classifier chain approach (Read et al., 2015) decomposes the joint likelihood into a product of conditionals and reduces the structured prediction problem into a series of binary prediction problems. 
In this approach, the error tends to propagate along the classifier chain, which limits its effectiveness (Dembczynski et al., 2010). Energy-based modeling, such as conditional random fields (Lafferty et al., 2001; Geman and Geman, 1984) and structured prediction energy networks (Belanger and McCallum, 2016) learn to assign a high likelihood to structures that exist in the training data set while keeping the likelihood low for unseen structures. Constraints can be incorporated into these models as prior terms in the energy function but approximated inference is required to compute the intractable partition function, which often hinders their scalability. Another line of research uses structured support vector machines (Tsochantaridis et al., 2005), which apply hinge loss and row generation approaches for structured prediction; however, these were superseded in performance by later neural-network-based approaches. Recently, generative models, such as conditional generative adversarial networks (Mirza and Osindero, 2014; Goodfellow et al., 2014), flow models (Rezende and Mohamed, 2015), and sequence-to-sequence models (Sutskever et al., 2014) have become increasingly popular for structured prediction. These models use highly flexible neural networks to increase model capability. The over-parameterized networks with gradient descent-based optimization can learn better representation for the structures than the classic shallow models. However, it is not straightforward to enforce constraints into the neural network-based models.  \\n\\nConstraints in Structured Prediction. Often the structured output space $\\\\mathcal{V}$ is subject to additional constraints $\\\\scriptscriptstyle\\\\mathcal{C}$ . The conditional probability that $y$ takes values that violate the (physical) constraints $\\\\mathcal{C}$ given the input $x$ is zero. Such information is known prior to the training of the machine learning model. 
Formally, we have:  \\n\\n$$\\np(y|x)\\\\left\\\\{\\\\int>0\\\\quad{\\\\mathrm{if~}}y{\\\\mathrm{~satisfies~}}{\\\\mathcal{C}},\\\\right.\\n$$  \\n\\nTake the first task discussed in this paper as an example. A valid delivery route should cover all the requested locations and should only visit each location once. Thus, the machine learning model should assign zero probability to those invalid routes. Notice that the constraints are often intricate and the inference problem of finding a valid structure satisfying constraints cannot be decomposed into independent small problems. After learning, the inference problem is to predict the structured output $y$ given the input $x$ . Such inference problems can be solved by either Maximum A Posteriori (MAP) inference, e.g. , computing $m a x_{y}\\\\ p(y|x)$ or marginal inference, e.g. , computing $\\\\mathbb{E}_{y}[p(y|x)]$ . Learning structured prediction models involves solving the inference problems within the learning loop, hence having an even higher complexity.  \\n\\nCombinatorial constraints render both the inference and the learning problems highly intractable. Indeed, much effort has been made to improve the efficiency of both the inference and learning problems (Pan and Srikumar, 2018; Bello et al., 2020). For example, Niculae et al. (2018) propose the sparseMAP function which solves the inference problem by returning a few sparse structures that attain high likelihoods. This inference method sits between the MAP and marginal inference. In their problem setup, sparseMAP can be solved via quadratic programming. However, combinatorial constraints considered in this paper make the inference problem non-convex, even for a fixed structured prediction model, let alone the more challenging learning problem. Overall, constrained structured prediction presents two main challenges. The first is the sample complexity , since massive data is needed to learn an accurate model in an exponentially large space. 
The second is the computational complexity , since it is combinatorially intractable (unless P=NP) to generate structured outputs subject to complicated constraints.  \\n\\nSequence-to-sequence Structured Prediction. Our proposed Core-Sp method is designed to extend sequence-to-sequence models, which are recently proposed popular structured prediction models (Sutskever et al., 2014). The sequence-to-sequence model uses the re-parameterization trick to model the conditional probability $p_{\\\\theta}(y|x)$ , where $x\\\\in\\\\mathscr{X}$ denotes the input variables and $y\\\\in\\\\mathcal{V}$ is the structured output. Here $\\\\theta$ denotes the parameters of the neural model. Instead of modeling the probability $p_{\\\\theta}(y|x)$ directly, the model introduces an additional random variable $\\\\mathcal{Z}$ and models it as a deterministic transformation from random variable $\\\\mathcal{Z}$ and evidence $x$ to the output $y$ . In other words, the conditional probability $p_{\\\\theta}(y|x)$ is an integral over random variable $z$ in the following way:  \\n\\n$$\\n\\\\begin{array}{c}{{p_{\\\\theta}(y|x)=\\\\displaystyle\\\\int p_{\\\\theta}(y|x,z)p(z)\\\\;d z,}}\\\\\\\\ {{p_{\\\\theta}(y|x,z)=\\\\mathbb{1}\\\\{y=f_{\\\\theta}(x,z)\\\\},}}\\\\end{array}\\n$$  \\n\\nwhere we assume $\\\\mathcal{Z}$ is from a known prior probability distribution $p(z)$ . As a result, we only need to model $p_{\\\\theta}(y|x,z)$ for the overall model $p_{\\\\theta}(y|x)$ . We further assume that $p_{\\\\theta}(y|x,z)$ is given in the form of a deterministic function. We let $f_{\\\\theta}(x,z)\\\\in\\\\mathcal{D}$ be a deterministic mapping from inputs $(x,z)$ to an output in the structured space $\\\\boldsymbol{y}$ . The indicator function $\\\\mathbb{I}\\\\{\\\\cdot\\\\}$ evaluates to $1$ if and only if $y=f_{\\\\boldsymbol{\\\\theta}}(x,z)$ . This formulation is closely related to the generative adversarial network and gives us high flexibility to model multi-modal distributions. 
Take the vehicle dispatching service planning as an example. The input $x$ is the daily service requests and $y$ is the suggested dispatching route. There can be several routes that meet the service demands and satisfy the driver’s underlying preference function. In this case, the conditional probability $p_{\\\\theta}(y|x)$ may have multiple modes, one for each good route. This formulation allows us to represent the multi-modal distribution effectively. The variable $z$ decides which route to pick. The function $f_{\\\\boldsymbol{\\\\theta}}(x,z)$ returns one route that meets the demand of input $x$ and is randomly selected by $\\\\mathcal{Z}$ . If $p_{\\\\theta}(y|x)$ has $k$ modes, the space of $z$ will be split into $k$ regions where variable $z$ in every region will be mapped to one mode in $p_{\\\\theta}(y|x)$ .  \\n\\nWe use a sequence-to-sequence neural network to model the function $f_{\\\\theta}(x,z)$ . Assume the input variables $x,\\\\ z$ , and the output $y$ are all represented in sequential forms $x=$ $(x_{1},x_{2},\\\\ldots,x_{T})$ ,$z\\\\,=\\\\,(z_{1},z_{2},\\\\dots,z_{T})$ and $y\\\\,=\\\\,(y_{1},y_{2},\\\\ldots,y_{T})$ .The sequence-to-sequence model is made of an encoder and a decoder. The sequential encoder receives $x$ and outputs a representation vector for input $x$ .The sequential decoder receives the output of the encoder as well as $z$ and outputs $y$ in $T$ steps, where $T$ refers to the maximum length for variable $y$ . In the $k$ -th step ( $1\\\\leq k\\\\leq T$ ), the decoder network takes $z_{k}$ , and the hidden vector $h_{k-1}$ from the previous step as inputs, and outputs a score vector $o_{k}=(o_{k1},o_{k2},\\\\dots,o_{k D_{k}})$ of length $D_{k}\\\\;=\\\\;|D(y_{k})|$ .Here, $o_{k}$ corresponds to the un-normalized likelihoods of each value that variable $y_{k}$ can take. 
The softmax function is then applied to get the normalized probability:  \\n\\n$$\\np_{k j}=p\\\\left(y_{k}=v_{j}|x,h_{k-1}\\\\right)={\\\\frac{\\\\exp(o_{k j})}{\\\\sum_{j^{\\\\prime}=1}^{D_{k}}\\\\exp(o_{k j^{\\\\prime}})}},\\\\qquad{\\\\mathrm{for~}}j=1,2,\\\\ldots,D_{k}.\\n$$  \\n\\n$p_{k j}$ is the probability that variable $y_{k}$ takes the $j$ -th value $v_{j}$ . Assume the prior distribution $p(z_{k})$ is the uniform distribution in $(0,1)$ , denoted by $\\\\mathcal{U}(0,1)$ . Variable $z_{k}$ is sampled from $\\\\mathcal{U}(0,1)$ and is used to determine the value for $y_{k}$ according to the probability distribution vector $p_{k}=(p_{k1},p_{k2},...\\\\,,p_{k D_{k}})$ . Let $P_{k1},P_{k2},...,P_{k(D_{k}+1)}$ be the cumulative probabilities:  \\n\\n$$\\nP_{k j}=\\\\left\\\\{\\\\!\\\\!\\\\begin{array}{l l}{0}&{\\\\mathrm{for~}j=1,}\\\\\\\\ {\\\\sum_{j^{\\\\prime}=1}^{j-1}p_{k j^{\\\\prime}}}&{\\\\mathrm{for~}j=2,3,...\\\\,,D_{k},}\\\\\\\\ {1}&{\\\\mathrm{for~}j=D_{k}+1.}\\\\end{array}\\\\!\\\\!\\\\right.\\n$$  \\n\\n$y_{k}$ is set to t e$v_{j}$ if and only if $z_{k}\\\\ \\\\in\\\\ \\\\left[P_{k j},P_{k(j+1)}\\\\right)$ '\\x01.Notice that because $z_{k}$ is sampled from U$\\\\mathcal{U}(0,1)$ 1), the probability that $y_{k}$ takes the $j$ -th value $v_{j}$ is exactly $p_{k j}$ . Aside from producing the value for $y_{k}$ in the $k$ -th step, the sequence-to-sequence neural net also produces the hidden-state vector $h_{k}$ at the $k$ -th step, which is used by the neural net again in the subsequent $(k+1)$ -th step. The overall architecture of the sequence-to-sequence model can be seen in Figure 4.  \\n\\nThe training process of the sequence-to-sequence model is to minimize a pre-defined loss function, or an additional discriminator neural net, which penalizes the differences of the predicted structure $f_{\\\\theta}(x,z)$ and the observed structure $y$ . Here $f_{\\\\theta}(x,z)$ is a predicted sequence obtained from the above process. 
Given a training data set $\\\\mathcal{D}^{t r}=\\\\{(\\\\boldsymbol{x}^{(i)},\\\\boldsymbol{y}^{(i)})\\\\}_{i=1}^{N}$ ,the learning objective is to minimize the loss function:  \\n\\n  \\nFigure 2: Illustration of Multi-valued Decision Diagrams (MDDs) for decision variables $x_{1},x_{2},x_{3}$ .(a) An exact MDD with all variable assignments satisfying two constraints: all-diff $(x_{1},x_{2},x_{3})$ and $x_{1}\\\\neq v_{1}$ .(b) A width-1 relaxed MDD for the exact MDD in (a). (c) A width-2 relaxed MDD, which is formed by combining nodes $u_{4}$ and $u_{5}$ of the MDD in (a).  \\n\\n$$\\n\\\\mathcal{L}(\\\\theta)=\\\\frac{1}{N}\\\\sum_{i=1}^{N}\\\\mathbb{E}_{z^{(i)}}\\\\left[\\\\ell\\\\left(f_{\\\\theta}\\\\left(x^{(i)},z^{(i)}\\\\right),y^{(i)}\\\\right)\\\\right].\\n$$  \\n\\nHere $\\\\ell(\\\\cdot,\\\\cdot)$ can be a predefined loss function that measures the mismatch between the predicted and observed structures. Function $\\\\ell(\\\\cdot,\\\\cdot)$ can also be represented as a discriminator network, which leads to the training of a generative adversarial network. The parameters $\\\\theta$ are updated via gradient descent, i.e. ,$\\\\theta^{t+1}=\\\\theta^{t}{-}\\\\eta\\\\nabla{\\\\mathcal{L}}(\\\\theta)$ , where $\\\\eta$ denotes the learning rate.\", 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633347470792, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 3, 'chunk_text': '# 2.2 Decision Diagrams\\nDecision diagrams were originally introduced to compactly represent Boolean functions in a graphical form (Akers, 1978; Bryant, 1986). Since then, they have been widely used in the context of verification and configuration problems (Wegener, 2000). 
More recently, they have been used successfully as an optimization tool, by representing the set of solutions to combinatorial optimization problems (Bergman et al., 2016b; van Hoeve, 2022).  \\n\\nDecision diagrams are defined with respect to a sequence of decision variables $x_{1},\\\\ldots,x_{n}$ .Variable $x_{i}$ has a domain of possible values $D(x_{i})$ , for $i=1,2,\\\\dots,n$ . A decision diagram is a directed acyclic graph, with $n+1$ layers of nodes. Layer 1 contains a single node $s$ ,called the root. Layer $n+1$ also contains a single node $t$ , called the terminal. An arc from a node in layer $i$ to a node in layer $i+1$ represents a possible assignment of variable $x_{i}$ to a value in its domain and is therefore associated with a label in $D(x_{i})$ . For an arc $e(v,u)$ ,we use ${\\\\tt v a l}(v,u)\\\\,\\\\in\\\\,D(x_{i})$ to represent the assigned label for variable $x_{i}$ . For a node $\\\\upsilon$ in layer $i$ , we use $\\\\mathtt{v a l}(v)\\\\subseteq D(x_{i})$ to represent the union of the values of each arc starting from node $v$ ,i.e. ,${\\\\mathsf{v a l}}(v)\\\\,=\\\\,\\\\cup_{e(v,u)}\\\\{{\\\\mathsf{v a l}}(v,u)\\\\}$ . In other words, ${\\\\tt v a l}(v)$ represents the possible value assignments for the decision variable $x_{i}$ at node $\\\\upsilon$ . Each path from the root $s$ to the terminal $t$ represents a solution, i.e. , a complete variable assignment. In this paper, we consider variables with domains of categorical values, which result in so-called multi-valued decision diagrams (MDDs) (Wegener, 2000). See Figure 2 for an example.  \\n\\n  \\nFigure 3: Node splitting and arc filtering for MDDs for variables $x_{1},x_{2},x_{3}$ .(a) A width-1 relaxed MDD as in Figure 2(b). 
(b) Split node $u_{1}$ into $\\\\hat{u}_{1}$ and $\\\\tilde{u}_{1}$ .(c) Filter arcs $e(\\\\hat{u}_{1},u_{2})\\\\;=\\\\;v_{2},e(\\\\tilde{u}_{1},u_{2})\\\\;=\\\\;v_{3}$ that violate the constraint all-diff $(x_{1},x_{2},x_{3})$ .The arcs in dashed lines are removed. (d) A width-2 relaxed MDD after one iteration of node splitting and arc filtering.  \\n\\nExact Decision Diagrams. Given a set of constraints $\\\\mathcal{C}$ , the MDD $\\\\mathcal{M}$ is said to be exact with respect to $\\\\mathcal{C}$ if and only if every path that leads from the root node $s$ to the terminal node $t$ in $\\\\mathcal{M}$ is a variable assignment satisfying all constraints in $\\\\scriptscriptstyle\\\\mathcal{C}$ . Conversely, every valid variable assignment can be found as a path from $s$ to $t$ in $\\\\mathcal{M}$ .  \\n\\nRelaxed Decision Diagrams. Since exact decision diagrams can grow exponentially large, relaxed decision diagrams were introduced to limit their size (Andersen et al., 2007). The set of paths in a relaxed decision diagram forms a superset of the paths in the associated exact decision diagram. Relaxed MDDs are often defined with respect to the maximum layer width, which is the number of nodes in its largest layer.  \\n\\nVariable Ordering. In general, the size of an exact decision diagram is known to strongly depend on the variable ordering (Friedman and Supowit, 1990). In our applications, however, we consider sequential decision processes which follow a natural prescribed ordering. Our approach can also be applied to more general decision problems, in which case the variable ordering needs to be considered when compiling the MDD.  \\n\\nExample 1 Figure $\\\\mathcal{Q}$ demonstrates several MDDs. 
Let $x_{1},x_{2},x_{3}$ be a sequence of decision variables with domain $D(x_{1})\\\\ =\\\\ D(x_{2})\\\\ =\\\\ D(x_{3})\\\\ =\\\\ \\\\{v_{1},v_{2},v_{3}\\\\}$ .The constraint $a\\\\,\\\\!\\\\ l\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\
\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,$ restricts the values of $x_{1},x_{2}$ and $x3$ to be all different, i.e., they form a permutation. The other constraint is $x_{1}\\\\neq v_{1}$ . (1) Exact MDD. The set of feasible permutations is $\\\\{(v_{2},v_{1},v_{3})$ ,$(v_{2},v_{3},v_{1})$ ,$(v_{3},v_{2},v_{1})$ ,$(v_{3},v_{1},v_{2})\\\\}$ . Figure 2(a) depicts the exact MDD that encodes all permutations satisfying the two constraints. (2) Relaxed MDD. Figure 2(b) is a width-1 relaxed MDD and Figure $\\\\mathcal{Q}(c)$ is a width$\\\\it{2}$ relaxed MDD. The set of paths in the relaxed MDD forms a superset of all feasible permutations. As an illustration, Figure 2(c) contains two infeasible solutions $\\\\{(v_{3},v_{1},v_{1}),(v_{2},v_{2},v_{2})\\\\}$ . (3) Variable ordering. All the MDDs in Figure 2 have the same variable ordering of $\\\\pi=(1,2,3)$ , meaning that the MDD first expands on variable $x_{1}$ , then $x_{2}$ , finally $x_{3}$ .  \\n\\nDecision Diagram Compilation. Decision diagrams can be compiled via a repeated process of node splitting and arc filtering from a width-1 relaxed MDD (Andersen et al., 2007; Bergman et al., 2016a). 
Arc filtering removes arcs that lead to infeasible solutions, while node splitting increases the size of the decision diagram by splitting one node into two or more nodes. In practice, one can reach an exact MDD by repeatedly going through the splitting and filtering processes from a width-1 MDD. We refer to Cir´e and van Hoeve (2013) for the detailed process of MDD compilation for sequential decision problems.  \\n\\nExample 2 Figure 3 demonstrates one possible process of applying the node splitting and arc filtering steps. We re-use the example in Figure $\\\\mathcal{Q}(b)$ as the initial MDD in Figure $\\\\mathcal{Y}(a)$ ,which depicts a width-1 relaxed MDD before compilation. The constraint to be applied is $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ , i.e., the assignments of variables $x_{1},x_{2},x_{3}$ should be pairwise different. The node $u_{1}$ in Figure $\\\\mathcal{Y}(a)$ is split into two nodes $\\\\hat{u}_{1},\\\\tilde{u}_{1}$ in Figure $\\\\mathcal{Y}(b)$ . The incoming arc $e(s,u_{1})$ with labe $v_{2}$ is assigned to node $\\\\hat{u}_{1}$ and the other incoming arc $e(s,u_{1})$ with label $v_{3}$ is assigned to node $\\\\tilde{u}_{1}$ . The outgoing arcs of node $u_{1}$ are copied for the two nodes. In Figure $\\\\mathcal{Y}(c)$ , the arc filtering process checks if certain variable assignments violate constraints for the two nodes. Arc $e(\\\\hat{u}_{1},u_{2})=v_{2}$ is not compatible with the previous arc $e(s,\\\\hat{u}_{1})$ with label $v_{2}$ because it violates $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ . Thus it is removed. For the same reason, arc $e(\\\\tilde{u}_{1},u_{2})=v_{3}$ is also removed. 
(d) We get a width$\\\\boldsymbol{\\\\mathscr{Q}}$ relaxed MDD after splitting node $u_{1}$ and filtering the arcs.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633380500938, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 4, 'chunk_text': '# 3. Constraint Reasoning Embedded Structured Prediction\\nCore-Sp is motivated by the lack of constraint satisfaction in sequence-to-sequence structured prediction models. The key idea of Core-Sp is the correspondence between the predicted outcomes of a sequence-to-sequence model and a path in a multi-valued decision diagram (MDD). Figure 4 provides an example. In this example, the sequence-to-sequence model outputs a sequence of variable assignments $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ ,$y_{3}=v_{1}$ in Figure 4(a), which exactly corresponds to the highlighted blue path in the MDD in Figure 4(b). However, the sequence-to-sequence model is also likely to output a variable assignment with no correspondence to the MDD. For example, if the neural model in Figure 4(a) outputs $y_{1}\\\\,=\\\\,v_{2}$ ,$y_{2}~=~v_{3}$ ,$y_{3}\\\\,=\\\\,v_{2}$ , there is no corresponding path in the MDD in Figure 4(b). This illustrates the case where the output of the sequence-to-sequence model violates the all-diff constraint. Indeed, neural network-based models for structured prediction problems are not guaranteed to satisfy constraints as defined in Equation (1), which forms a key limitation of state-of-the-art structured prediction models.  \\n\\nCore-Sp ensures constraint satisfaction of the neural network prediction by limiting the values that each variable can take following the flow of the MDD. Suppose we set $y_{1}=v_{2}$ Figure 4: Illustration of (a) a sequence-to-sequence model which generates an output corresponding to (b) a path in the multi-valued decision diagram. 
(a) A sequenceto-sequence model receives input $x$ and random variables $\\\\mathcal{Z}$ , and outputs $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ and $y_{3}=v_{1}$ in three steps. (b) The assignment $\\\\left(y_{1},y_{2},y_{3}\\\\right)=\\\\left(v_{2},v_{3},v_{1}\\\\right)$ corresponds to path $s\\\\;{\\\\xrightarrow{v_{2}}}\\\\;u_{1}\\\\;{\\\\xrightarrow{v_{3}}}\\\\;u_{4}\\\\;{\\\\xrightarrow{v_{1}}}\\\\;t$ −→ −→ −→ in the multi-valued decision diagram.  \\n\\n  \\n\\nand $y_{2}=v_{3}$ in Figure 4(b) and arrive at node $u_{4}$ , the only valid option for $y3$ is to set $y_{3}=v_{1}$ . The other options $y_{3}=v_{2}$ or $y_{3}=v_{3}$ lead to constraint violations. Hence CoreSp masks out the choices of $y_{3}=v_{2}$ and $y_{3}=v_{3}$ for the sequence-to-sequence model. In this way, Core-Sp addresses a key limitation of structured prediction models. We next provide the details of Core-Sp .', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}]\n"
     ]
    }
   ],
   "source": [
    "# Exercise the /query_by_paper_id endpoint: fetch up to 5 chunks for one paper\n",
    "print(\"\\nTesting /query_by_paper_id API...\")\n",
    "paper_id_params = dict(paper_id=\"64a29654d68f896efa29af31\", top_k=5)\n",
    "data = call_api(\"query_by_paper_id\", params=paper_id_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a9681ec9-c931-4011-9daf-f1a99978f94a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Testing /query_by_title API...\n",
      "[SUCCESS] query_by_title - Response:\n",
      "[{'id': 454846633242875330, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 0, 'chunk_text': '# Constraint Reasoning Embedded Structured Prediction\\nNan Jiang   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Maosen Zhang ByteDance Beijing, China.  \\n\\n Willem-Jan van Hoeve Tepper School of Business Carnegie Mellon University Pittsburgh, Pennsylvania, USA.  \\n\\n  \\n\\nYexiang Xue   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Editor: Maya Gupta\\n\\n# Abstract\\nMany real-world structured prediction problems need machine learning to capture data distribution and constraint reasoning to ensure structure validity. Nevertheless, constrained structured prediction is still limited in real-world applications because of the lack of tools to bridge constraint satisfaction and machine learning. In this paper, we propose CO nstraint RE asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over structured domains. We propose to embed decision diagrams, a popular constraint reasoning tool, as a fullydifferentiable module into deep neural networks for structured prediction. We also propose an iterative search algorithm to automate the searching process of the best Core-Sp structure. We evaluate Core-Sp on three applications: vehicle dispatching service planning, if-then program synthesis, and text2SQL generation. The proposed Core-Sp module demonstrates superior performance over state-of-the-art approaches in all three applications. The structures generated with Core-Sp satisfy 100% of the constraints when using exact decision diagrams. In addition, Core-Sp boosts learning performance by reducing the modeling space via constraint satisfaction.  
\\n\\nKeywords: Constraint Reasoning, Decision Diagrams, Structured Prediction.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633277740484, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 1, 'chunk_text': '# 1. Introduction\\nThe emergence of large-scale constraint reasoning and machine learning technologies have impacted virtually all application domains, including marketing, linguistics, operations, retail, robotics, and health care. Constraint reasoning has traditionally been applied to building prescriptive models that generate solutions for strategic, tactical, or operational use (Choi et al., 2012). It requires a precise problem description and is usually difficult to be made flexible to the evolving data distributions. Machine learning, on the other hand, has been applied primarily to build predictive models, such as classifications or regressions (Michalski and Anderson, 1984; Bishop, 2007). While the structure of a machine learning model (like a neural network) must be designed, the actual model parameters are learned automatically via gradient descent algorithms. This gives machine learning models the flexibility to adapt to the evolving data distributions. Nevertheless, it is difficult to enforce constraints on the output of machine learning models. Many real-world applications are beyond the reach of constraint reasoning or machine learning alone.  \\n\\n  \\nFigure 1: (a) Our proposed Core-Sp framework embeds constraint reasoning in machine learning for structured prediction. We demonstrate the effectiveness of Core-Sp on vehicle dispatching service, if-then program synthesis, and Text2SQL generation tasks. (b) At a high level, Core-Sp (in orange colored box) is a fully differentiable layer that simulates a path descending in the corresponding decision diagram. 
Core-Sp filters out the infeasible output from the structured output to ensure constraint satisfaction.  \\n\\nIn this paper, we focus on structured prediction problems, which is a class of learning problems requiring both constraint reasoning and machine learning. It expands the output space of classification problems into high-dimensional structured space. Structured prediction has diverse application domains, ranging from natural language processing (Socher et al., 2013), social network analysis (Xiang and Neville, 2013), and ecological modeling (Tang et al., 2018; Chen et al., 2018). The applications we consider in this paper all require tight integration of constraint reasoning and machine learning. Our first application vehicle dispatching service planning is to recommend a route that satisfies the daily service needs as well as meeting the drivers’ preferences. Historical data may reveal that the drivers do not follow common stylized objectives such as minimizing distance or time. Therefore standard constraint reasoning tools, e.g. , solvers for the traveling salesman problem, cannot be applied. While we need machine learning to capture the drivers’ objective functions, pure machine learning-based approaches are insufficient because they often generate routes that violate delivery requests. Our second and third applications are program synthesis from natural language , which clearly requires machine learning to generate structured programs. Nevertheless, a pure learning approach cannot enforce the syntactic and semantic rules of those programs.  \\n\\nWe propose Co nstraint Re asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over the structured domains. The main idea is to augment structured predictive models with a constraint reasoning module that represents physical and operational requirements. 
Specifically, we propose to embed decision diagrams (Akers, 1978; Bryant, 1986), a popular constraint reasoning tool, as a fully-differentiable module into deep neural networks. A decision diagram is a compact graphical representation of the constraints. It encodes each solution (an assignment of values to variables satisfying the constraints) as a path from the root to the terminal in the diagram. Core-Sp regards the neural network predictions as the simulation of descending along a path in the decision diagram. To ensure constraint satisfaction, Core-Sp filters out variable assignments from the neural network predictions that violate constraints. With the integration of Core-Sp , we provide structured prediction models with constraint satisfaction assurances. Moreover, structured prediction models with the Core-Sp layer enjoy a smaller prediction space than traditional structured prediction approaches, allowing our approach to learn faster in training and generalize better in testing. See Figure 1(a) for our proposed Core-Sp model which integrates constraint reasoning and machine learning for the three application domains. The high-level idea of Core-Sp is illustrated in Figure 1(b).  \\n\\nPrevious approaches have considered regularizing machine learning with constraint reasoning in various application domains. Within the broader context of learning constrained models, the work of Coletta et al. (2003); Lallouet et al. (2010); Beldiceanu and Simonis (2012); Bessiere et al. (2017); Addi et al. (2018) have studied automating the constraint acquisition process from historic data or (user-)generated queries. These approaches use partial or complete examples to identify the constraints that can be added to the model. The type of constraints that can be learned depends on the formulation. 
Several works (Punyakanok et al., 2004; Roth and Yih, 2005; Amos and Kolter, 2017; Ferber et al., 2020) enable learning in a constrained domain via encoding mathematical programming, such as quadratic programming or mixed integer linear programming, as a neural network layer. Deutsch et al. (2019) propose to formulate the output space as an automata. They use the constraints to prune all the invalid transitions in the automata to ensure the validity of the structured outputs. In addition, constraints imposed by a knowledge graph have been embedded into the neural network as differentiable layers (Peters et al., 2019; Wu et al., 2017). Zeng et al. (2021) and Heim (2019) enforce physical constraints or expert inputs as soft constraints. We will illustrate the difference between our approach and these methods in Section 3.2. A different approach is to embed a machine learning model into optimization, e.g. , by extending a constraint system with appropriate global constraints. For example, Lallouet and Legtchenko (2007) integrate neural networks and decision trees with constraint programming, while Lombardi et al. (2017) and Lombardi and Gualandi (2016) introduce a “Neuron” global constraint that represents a pre-trained neural network. Another series of approaches based on grammar variational autoencoders (Kusner et al., 2017; Dai et al., 2018; Jin et al., 2018) use neural networks to encode and decode from the parse-tree of a context-free grammar to generate discrete structures. Such approaches are used to generate chemical molecule expressions, which represent a structured domain. Machine learning approaches have also been used to solve constraint reasoning and optimization problems. This includes the works of Galassi et al. (2018) and Vinyals et al. (2015), which use neural networks to extend partial solutions to complete ones. Bello et al. (2017) handle the traveling salesman problem by framing it as reinforcement learning. Selsam et al. 
(2019) proposes to learn an SAT solver from single-bit supervision. Approaches based on neural Turing machines (Graves et al., 2016) employ neural networks with external memory for discrete structure generation. More recently, Khalil et al. (2017) tackle the combinatorial optimization problems in graphs, by employing neural networks to learn the heuristics in the backtrack-free search. There is also a recent trend to synthesize programs using machine learning (Guu et al., 2017; Shi et al., 2019).  \\n\\nIn experimental analysis, we demonstrate the effectiveness of Core-Sp on the following three applications: (1) Vehicle Dispatching Service Planning : a route planning problem that recommends routes to drivers to meet the service needs while satisfying the drivers’ preferences. The implicit preferences of drivers are learned from the historical traveling data. The input of this problem is the daily service requests. The output is the permutations of the service locations, representing the sequential order that the locations should be visited by the drivers. This task requires machine learning models to capture drivers’ preferences from the traveling data, and constraint reasoning to ensure the satisfaction of service requests. (2) If-then Program Synthesis : the task is to automatically synthesize conditional programs from the natural language. Automatic program synthesis tools are useful to streamline the program of a few online services such as IFTTT and Zapier. The if-then program is in the form of: if trigger function happens in the trigger service , then take the action function from the action service . The machine learning task, therefore, is to predict the quadruple ( trigger service ,trigger function ,action service ,action function ). This application again requires machine learning to understand the semantics of the natural language, as well as constraint reasoning to satisfy the syntactic rules of the programs. 
(3) Text2SQL Generation : our last application is to automatically generate SQL queries that extract information from a database to answer a question posed in natural language. The neural model is used to understand the user’s queries in natural language while the constraint reasoning tool is applied to ensure the model generates grammaticallyvalid SQL queries.  \\n\\nOur proposed Core-Sp framework demonstrates superior performance against the stateof-the-art approaches in all three applications. First, the structures generated by Core-Sp are better in constraint satisfaction. In vehicle service dispatching, all Core-Sp generated routes are valid, while a conditional generative adversarial network (cGAN) without CoreSp generates on average less than $1\\\\%$ of valid routes when handling medium-sized delivery requests. We also apply a post-processing step (Deudon et al., 2018) to boost cGAN’s performance, but it cannot handle the complexity brought by the large combinatorial space of the routing problem. Its performance quickly defaults to the case without post-processing as the number of delivery locations increases. For if-then program synthesis, the percentage of valid programs produced increased from 88% to 100% with the Core-Sp module incorporated into the state-of-the-art LatentAttention model (Liu et al., 2016). For Text2SQL, the percentage of valid SQL queries increased from 83 .7% to 100% with Core-Sp incorporated into the state-of-the-art SQLNova model (Hwang et al., 2019) on a hard testing set. Core-Sp also improves the learning performance of structured prediction models. We show that the routes generated by Core-Sp better fulfill drivers’ preferences than cGAN without Core-Sp . In if-then program synthesis, Core-Sp module leads to approximately $2.0\\\\%$ improvement in accuracy compared with the state-of-the-art LatentAttention model and converges to models with higher accuracy in fewer training epochs. 
In Text2SQL generation, the Core-Sp module improves around 4 .2% in execution accuracy and 1 .9% in logical accuracy against SQLNova on a challenging test set.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633313916358, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 2, 'chunk_text': \"# 2.1 Structured Prediction\\nStructured prediction expands the output space of classification problems into a highdimensional combinatorial space (Bakır et al., 2007). Specifically, given a set of inputoutput samples $\\\\mathcal{D}^{t r}\\\\,=\\\\,\\\\{({\\\\boldsymbol{x}}^{(i)},{\\\\boldsymbol{y}}^{(i)})\\\\}_{i=1}^{N}$ drawn i.i.d. from some unknown distribution over the space $\\\\mathcal X\\\\times\\\\mathcal Y$ , a structured pr tion model learns a conditional distribution $p_{\\\\theta}(y|x)$ ,for all $(x,y)\\\\,\\\\in\\\\,\\\\mathcal{X}\\\\times\\\\mathcal{Y}$ from data D$\\\\mathcal{D}^{t r}$ , where $\\\\theta$ denotes the parameters of the structured prediction model. Note that the output space ${\\\\boldsymbol{\\\\mathcal{D}}}=\\\\{0,1\\\\}^{\\\\iota}$ is a high dimensional space of combinatorial structures. The three applications we consider in this paper are all structured prediction problems. In vehicle dispatching service planning, the structured outputs are the delivery routes on a map. In if-then program synthesis, the structured outputs are the programs that complete web-service tasks. In Text2SQL generation, the structured outputs are the SQL queries that follow the SQL grammar.  \\n\\nIn the literature, various approaches have been proposed for structured prediction problems. The classifier chain approach (Read et al., 2015) decomposes the joint likelihood into a product of conditionals and reduces the structured prediction problem into a series of binary prediction problems. 
In this approach, the error tends to propagate along the classifier chain, which limits its effectiveness (Dembczynski et al., 2010). Energy-based modeling, such as conditional random fields (Lafferty et al., 2001; Geman and Geman, 1984) and structured prediction energy networks (Belanger and McCallum, 2016) learn to assign a high likelihood to structures that exist in the training data set while keeping the likelihood low for unseen structures. Constraints can be incorporated into these models as prior terms in the energy function but approximated inference is required to compute the intractable partition function, which often hinders their scalability. Another line of research uses structured support vector machines (Tsochantaridis et al., 2005), which apply hinge loss and row generation approaches for structured prediction; however, these were superseded in performance by later neural-network-based approaches. Recently, generative models, such as conditional generative adversarial networks (Mirza and Osindero, 2014; Goodfellow et al., 2014), flow models (Rezende and Mohamed, 2015), and sequence-to-sequence models (Sutskever et al., 2014) have become increasingly popular for structured prediction. These models use highly flexible neural networks to increase model capability. The over-parameterized networks with gradient descent-based optimization can learn better representation for the structures than the classic shallow models. However, it is not straightforward to enforce constraints into the neural network-based models.  \\n\\nConstraints in Structured Prediction. Often the structured output space $\\\\mathcal{V}$ is subject to additional constraints $\\\\scriptscriptstyle\\\\mathcal{C}$ . The conditional probability that $y$ takes values that violate the (physical) constraints $\\\\mathcal{C}$ given the input $x$ is zero. Such information is known prior to the training of the machine learning model. 
Formally, we have:  \\n\\n$$\\np(y|x)\\\\left\\\\{\\\\int>0\\\\quad{\\\\mathrm{if~}}y{\\\\mathrm{~satisfies~}}{\\\\mathcal{C}},\\\\right.\\n$$  \\n\\nTake the first task discussed in this paper as an example. A valid delivery route should cover all the requested locations and should only visit each location once. Thus, the machine learning model should assign zero probability to those invalid routes. Notice that the constraints are often intricate and the inference problem of finding a valid structure satisfying constraints cannot be decomposed into independent small problems. After learning, the inference problem is to predict the structured output $y$ given the input $x$ . Such inference problems can be solved by either Maximum A Posteriori (MAP) inference, e.g. , computing $m a x_{y}\\\\ p(y|x)$ or marginal inference, e.g. , computing $\\\\mathbb{E}_{y}[p(y|x)]$ . Learning structured prediction models involves solving the inference problems within the learning loop, hence having an even higher complexity.  \\n\\nCombinatorial constraints render both the inference and the learning problems highly intractable. Indeed, much effort has been made to improve the efficiency of both the inference and learning problems (Pan and Srikumar, 2018; Bello et al., 2020). For example, Niculae et al. (2018) propose the sparseMAP function which solves the inference problem by returning a few sparse structures that attain high likelihoods. This inference method sits between the MAP and marginal inference. In their problem setup, sparseMAP can be solved via quadratic programming. However, combinatorial constraints considered in this paper make the inference problem non-convex, even for a fixed structured prediction model, let alone the more challenging learning problem. Overall, constrained structured prediction presents two main challenges. The first is the sample complexity , since massive data is needed to learn an accurate model in an exponentially large space. 
The second is the computational complexity , since it is combinatorially intractable (unless P=NP) to generate structured outputs subject to complicated constraints.  \\n\\nSequence-to-sequence Structured Prediction. Our proposed Core-Sp method is designed to extend sequence-to-sequence models, which are recently proposed popular structured prediction models (Sutskever et al., 2014). The sequence-to-sequence model uses the re-parameterization trick to model the conditional probability $p_{\\\\theta}(y|x)$ , where $x\\\\in\\\\mathscr{X}$ denotes the input variables and $y\\\\in\\\\mathcal{V}$ is the structured output. Here $\\\\theta$ denotes the parameters of the neural model. Instead of modeling the probability $p_{\\\\theta}(y|x)$ directly, the model introduces an additional random variable $\\\\mathcal{Z}$ and models it as a deterministic transformation from random variable $\\\\mathcal{Z}$ and evidence $x$ to the output $y$ . In other words, the conditional probability $p_{\\\\theta}(y|x)$ is an integral over random variable $z$ in the following way:  \\n\\n$$\\n\\\\begin{array}{c}{{p_{\\\\theta}(y|x)=\\\\displaystyle\\\\int p_{\\\\theta}(y|x,z)p(z)\\\\;d z,}}\\\\\\\\ {{p_{\\\\theta}(y|x,z)=\\\\mathbb{1}\\\\{y=f_{\\\\theta}(x,z)\\\\},}}\\\\end{array}\\n$$  \\n\\nwhere we assume $\\\\mathcal{Z}$ is from a known prior probability distribution $p(z)$ . As a result, we only need to model $p_{\\\\theta}(y|x,z)$ for the overall model $p_{\\\\theta}(y|x)$ . We further assume that $p_{\\\\theta}(y|x,z)$ is given in the form of a deterministic function. We let $f_{\\\\theta}(x,z)\\\\in\\\\mathcal{D}$ be a deterministic mapping from inputs $(x,z)$ to an output in the structured space $\\\\boldsymbol{y}$ . The indicator function $\\\\mathbb{I}\\\\{\\\\cdot\\\\}$ evaluates to $1$ if and only if $y=f_{\\\\boldsymbol{\\\\theta}}(x,z)$ . This formulation is closely related to the generative adversarial network and gives us high flexibility to model multi-modal distributions. 
Take the vehicle dispatching service planning as an example. The input $x$ is the daily service requests and $y$ is the suggested dispatching route. There can be several routes that meet the service demands and satisfy the driver’s underlying preference function. In this case, the conditional probability $p_{\\\\theta}(y|x)$ may have multiple modes, one for each good route. This formulation allows us to represent the multi-modal distribution effectively. The variable $z$ decides which route to pick. The function $f_{\\\\boldsymbol{\\\\theta}}(x,z)$ returns one route that meets the demand of input $x$ and is randomly selected by $\\\\mathcal{Z}$ . If $p_{\\\\theta}(y|x)$ has $k$ modes, the space of $z$ will be split into $k$ regions where variable $z$ in every region will be mapped to one mode in $p_{\\\\theta}(y|x)$ .  \\n\\nWe use a sequence-to-sequence neural network to model the function $f_{\\\\theta}(x,z)$ . Assume the input variables $x,\\\\ z$ , and the output $y$ are all represented in sequential forms $x=$ $(x_{1},x_{2},\\\\ldots,x_{T})$ ,$z\\\\,=\\\\,(z_{1},z_{2},\\\\dots,z_{T})$ and $y\\\\,=\\\\,(y_{1},y_{2},\\\\ldots,y_{T})$ .The sequence-to-sequence model is made of an encoder and a decoder. The sequential encoder receives $x$ and outputs a representation vector for input $x$ .The sequential decoder receives the output of the encoder as well as $z$ and outputs $y$ in $T$ steps, where $T$ refers to the maximum length for variable $y$ . In the $k$ -th step ( $1\\\\leq k\\\\leq T$ ), the decoder network takes $z_{k}$ , and the hidden vector $h_{k-1}$ from the previous step as inputs, and outputs a score vector $o_{k}=(o_{k1},o_{k2},\\\\dots,o_{k D_{k}})$ of length $D_{k}\\\\;=\\\\;|D(y_{k})|$ .Here, $o_{k}$ corresponds to the un-normalized likelihoods of each value that variable $y_{k}$ can take. 
The softmax function is then applied to get the normalized probability:  \\n\\n$$\\np_{k j}=p\\\\left(y_{k}=v_{j}|x,h_{k-1}\\\\right)={\\\\frac{\\\\exp(o_{k j})}{\\\\sum_{j^{\\\\prime}=1}^{D_{k}}\\\\exp(o_{k j^{\\\\prime}})}},\\\\qquad{\\\\mathrm{for~}}j=1,2,\\\\ldots,D_{k}.\\n$$  \\n\\n$p_{k j}$ is the probability that variable $y_{k}$ takes the $j$ -th value $v_{j}$ . Assume the prior distribution $p(z_{k})$ is the uniform distribution in $(0,1)$ , denoted by $\\\\mathcal{U}(0,1)$ . Variable $z_{k}$ is sampled from $\\\\mathcal{U}(0,1)$ and is used to determine the value for $y_{k}$ according to the probability distribution vector $p_{k}=(p_{k1},p_{k2},...\\\\,,p_{k D_{k}})$ . Let $P_{k1},P_{k2},...,P_{k(D_{k}+1)}$ be the cumulative probabilities:  \\n\\n$$\\nP_{k j}=\\\\left\\\\{\\\\!\\\\!\\\\begin{array}{l l}{0}&{\\\\mathrm{for~}j=1,}\\\\\\\\ {\\\\sum_{j^{\\\\prime}=1}^{j-1}p_{k j^{\\\\prime}}}&{\\\\mathrm{for~}j=2,3,...\\\\,,D_{k},}\\\\\\\\ {1}&{\\\\mathrm{for~}j=D_{k}+1.}\\\\end{array}\\\\!\\\\!\\\\right.\\n$$  \\n\\n$y_{k}$ is set to t e$v_{j}$ if and only if $z_{k}\\\\ \\\\in\\\\ \\\\left[P_{k j},P_{k(j+1)}\\\\right)$ '\\x01.Notice that because $z_{k}$ is sampled from U$\\\\mathcal{U}(0,1)$ 1), the probability that $y_{k}$ takes the $j$ -th value $v_{j}$ is exactly $p_{k j}$ . Aside from producing the value for $y_{k}$ in the $k$ -th step, the sequence-to-sequence neural net also produces the hidden-state vector $h_{k}$ at the $k$ -th step, which is used by the neural net again in the subsequent $(k+1)$ -th step. The overall architecture of the sequence-to-sequence model can be seen in Figure 4.  \\n\\nThe training process of the sequence-to-sequence model is to minimize a pre-defined loss function, or an additional discriminator neural net, which penalizes the differences of the predicted structure $f_{\\\\theta}(x,z)$ and the observed structure $y$ . Here $f_{\\\\theta}(x,z)$ is a predicted sequence obtained from the above process. 
Given a training data set $\\\\mathcal{D}^{t r}=\\\\{(\\\\boldsymbol{x}^{(i)},\\\\boldsymbol{y}^{(i)})\\\\}_{i=1}^{N}$ ,the learning objective is to minimize the loss function:  \\n\\n  \\nFigure 2: Illustration of Multi-valued Decision Diagrams (MDDs) for decision variables $x_{1},x_{2},x_{3}$ .(a) An exact MDD with all variable assignments satisfying two constraints: all-diff $(x_{1},x_{2},x_{3})$ and $x_{1}\\\\neq v_{1}$ .(b) A width-1 relaxed MDD for the exact MDD in (a). (c) A width-2 relaxed MDD, which is formed by combining nodes $u_{4}$ and $u_{5}$ of the MDD in (a).  \\n\\n$$\\n\\\\mathcal{L}(\\\\theta)=\\\\frac{1}{N}\\\\sum_{i=1}^{N}\\\\mathbb{E}_{z^{(i)}}\\\\left[\\\\ell\\\\left(f_{\\\\theta}\\\\left(x^{(i)},z^{(i)}\\\\right),y^{(i)}\\\\right)\\\\right].\\n$$  \\n\\nHere $\\\\ell(\\\\cdot,\\\\cdot)$ can be a predefined loss function that measures the mismatch between the predicted and observed structures. Function $\\\\ell(\\\\cdot,\\\\cdot)$ can also be represented as a discriminator network, which leads to the training of a generative adversarial network. The parameters $\\\\theta$ are updated via gradient descent, i.e. ,$\\\\theta^{t+1}=\\\\theta^{t}{-}\\\\eta\\\\nabla{\\\\mathcal{L}}(\\\\theta)$ , where $\\\\eta$ denotes the learning rate.\", 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633347470792, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 3, 'chunk_text': '# 2.2 Decision Diagrams\\nDecision diagrams were originally introduced to compactly represent Boolean functions in a graphical form (Akers, 1978; Bryant, 1986). Since then, they have been widely used in the context of verification and configuration problems (Wegener, 2000). 
More recently, they have been used successfully as an optimization tool, by representing the set of solutions to combinatorial optimization problems (Bergman et al., 2016b; van Hoeve, 2022).  \\n\\nDecision diagrams are defined with respect to a sequence of decision variables $x_{1},\\\\ldots,x_{n}$ .Variable $x_{i}$ has a domain of possible values $D(x_{i})$ , for $i=1,2,\\\\dots,n$ . A decision diagram is a directed acyclic graph, with $n+1$ layers of nodes. Layer 1 contains a single node $s$ ,called the root. Layer $n+1$ also contains a single node $t$ , called the terminal. An arc from a node in layer $i$ to a node in layer $i+1$ represents a possible assignment of variable $x_{i}$ to a value in its domain and is therefore associated with a label in $D(x_{i})$ . For an arc $e(v,u)$ ,we use ${\\\\tt v a l}(v,u)\\\\,\\\\in\\\\,D(x_{i})$ to represent the assigned label for variable $x_{i}$ . For a node $\\\\upsilon$ in layer $i$ , we use $\\\\mathtt{v a l}(v)\\\\subseteq D(x_{i})$ to represent the union of the values of each arc starting from node $v$ ,i.e. ,${\\\\mathsf{v a l}}(v)\\\\,=\\\\,\\\\cup_{e(v,u)}\\\\{{\\\\mathsf{v a l}}(v,u)\\\\}$ . In other words, ${\\\\tt v a l}(v)$ represents the possible value assignments for the decision variable $x_{i}$ at node $\\\\upsilon$ . Each path from the root $s$ to the terminal $t$ represents a solution, i.e. , a complete variable assignment. In this paper, we consider variables with domains of categorical values, which result in so-called multi-valued decision diagrams (MDDs) (Wegener, 2000). See Figure 2 for an example.  \\n\\n  \\nFigure 3: Node splitting and arc filtering for MDDs for variables $x_{1},x_{2},x_{3}$ .(a) A width-1 relaxed MDD as in Figure 2(b). 
(b) Split node $u_{1}$ into $\\\\hat{u}_{1}$ and $\\\\tilde{u}_{1}$ .(c) Filter arcs $e(\\\\hat{u}_{1},u_{2})\\\\;=\\\\;v_{2},e(\\\\tilde{u}_{1},u_{2})\\\\;=\\\\;v_{3}$ that violate the constraint all-diff $(x_{1},x_{2},x_{3})$ .The arcs in dashed lines are removed. (d) A width-2 relaxed MDD after one iteration of node splitting and arc filtering.  \\n\\nExact Decision Diagrams. Given a set of constraints $\\\\mathcal{C}$ , the MDD $\\\\mathcal{M}$ is said to be exact with respect to $\\\\mathcal{C}$ if and only if every path that leads from the root node $s$ to the terminal node $t$ in $\\\\mathcal{M}$ is a variable assignment satisfying all constraints in $\\\\scriptscriptstyle\\\\mathcal{C}$ . Conversely, every valid variable assignment can be found as a path from $s$ to $t$ in $\\\\mathcal{M}$ .  \\n\\nRelaxed Decision Diagrams. Since exact decision diagrams can grow exponentially large, relaxed decision diagrams were introduced to limit their size (Andersen et al., 2007). The set of paths in a relaxed decision diagram forms a superset of the paths in the associated exact decision diagram. Relaxed MDDs are often defined with respect to the maximum layer width, which is the number of nodes in its largest layer.  \\n\\nVariable Ordering. In general, the size of an exact decision diagram is known to strongly depend on the variable ordering (Friedman and Supowit, 1990). In our applications, however, we consider sequential decision processes which follow a natural prescribed ordering. Our approach can also be applied to more general decision problems, in which case the variable ordering needs to be considered when compiling the MDD.  \\n\\nExample 1 Figure $\\\\mathcal{Q}$ demonstrates several MDDs. 
Let $x_{1},x_{2},x_{3}$ be a sequence of decision variables with domain $D(x_{1})\\\\ =\\\\ D(x_{2})\\\\ =\\\\ D(x_{3})\\\\ =\\\\ \\\\{v_{1},v_{2},v_{3}\\\\}$ .The constraint $a\\\\,\\\\!\\\\ l\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\
\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,$ restricts the values of $x_{1},x_{2}$ and $x3$ to be all different, i.e., they form a permutation. The other constraint is $x_{1}\\\\neq v_{1}$ . (1) Exact MDD. The set of feasible permutations is $\\\\{(v_{2},v_{1},v_{3})$ ,$(v_{2},v_{3},v_{1})$ ,$(v_{3},v_{2},v_{1})$ ,$(v_{3},v_{1},v_{2})\\\\}$ . Figure 2(a) depicts the exact MDD that encodes all permutations satisfying the two constraints. (2) Relaxed MDD. Figure 2(b) is a width-1 relaxed MDD and Figure $\\\\mathcal{Q}(c)$ is a width$\\\\it{2}$ relaxed MDD. The set of paths in the relaxed MDD forms a superset of all feasible permutations. As an illustration, Figure 2(c) contains two infeasible solutions $\\\\{(v_{3},v_{1},v_{1}),(v_{2},v_{2},v_{2})\\\\}$ . (3) Variable ordering. All the MDDs in Figure 2 have the same variable ordering of $\\\\pi=(1,2,3)$ , meaning that the MDD first expands on variable $x_{1}$ , then $x_{2}$ , finally $x_{3}$ .  \\n\\nDecision Diagram Compilation. Decision diagrams can be compiled via a repeated process of node splitting and arc filtering from a width-1 relaxed MDD (Andersen et al., 2007; Bergman et al., 2016a). 
Arc filtering removes arcs that lead to infeasible solutions, while node splitting increases the size of the decision diagram by splitting one node into two or more nodes. In practice, one can reach an exact MDD by repeatedly going through the splitting and filtering processes from a width-1 MDD. We refer to Cir´e and van Hoeve (2013) for the detailed process of MDD compilation for sequential decision problems.  \\n\\nExample 2 Figure 3 demonstrates one possible process of applying the node splitting and arc filtering steps. We re-use the example in Figure $\\\\mathcal{Q}(b)$ as the initial MDD in Figure $\\\\mathcal{Y}(a)$ ,which depicts a width-1 relaxed MDD before compilation. The constraint to be applied is $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ , i.e., the assignments of variables $x_{1},x_{2},x_{3}$ should be pairwise different. The node $u_{1}$ in Figure $\\\\mathcal{Y}(a)$ is split into two nodes $\\\\hat{u}_{1},\\\\tilde{u}_{1}$ in Figure $\\\\mathcal{Y}(b)$ . The incoming arc $e(s,u_{1})$ with labe $v_{2}$ is assigned to node $\\\\hat{u}_{1}$ and the other incoming arc $e(s,u_{1})$ with label $v_{3}$ is assigned to node $\\\\tilde{u}_{1}$ . The outgoing arcs of node $u_{1}$ are copied for the two nodes. In Figure $\\\\mathcal{Y}(c)$ , the arc filtering process checks if certain variable assignments violate constraints for the two nodes. Arc $e(\\\\hat{u}_{1},u_{2})=v_{2}$ is not compatible with the previous arc $e(s,\\\\hat{u}_{1})$ with label $v_{2}$ because it violates $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ . Thus it is removed. For the same reason, arc $e(\\\\tilde{u}_{1},u_{2})=v_{3}$ is also removed. 
(d) We get a width$\\\\boldsymbol{\\\\mathscr{Q}}$ relaxed MDD after splitting node $u_{1}$ and filtering the arcs.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633380500938, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 4, 'chunk_text': '# 3. Constraint Reasoning Embedded Structured Prediction\\nCore-Sp is motivated by the lack of constraint satisfaction in sequence-to-sequence structured prediction models. The key idea of Core-Sp is the correspondence between the predicted outcomes of a sequence-to-sequence model and a path in a multi-valued decision diagram (MDD). Figure 4 provides an example. In this example, the sequence-to-sequence model outputs a sequence of variable assignments $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ ,$y_{3}=v_{1}$ in Figure 4(a), which exactly corresponds to the highlighted blue path in the MDD in Figure 4(b). However, the sequence-to-sequence model is also likely to output a variable assignment with no correspondence to the MDD. For example, if the neural model in Figure 4(a) outputs $y_{1}\\\\,=\\\\,v_{2}$ ,$y_{2}~=~v_{3}$ ,$y_{3}\\\\,=\\\\,v_{2}$ , there is no corresponding path in the MDD in Figure 4(b). This illustrates the case where the output of the sequence-to-sequence model violates the all-diff constraint. Indeed, neural network-based models for structured prediction problems are not guaranteed to satisfy constraints as defined in Equation (1), which forms a key limitation of state-of-the-art structured prediction models.  \\n\\nCore-Sp ensures constraint satisfaction of the neural network prediction by limiting the values that each variable can take following the flow of the MDD. Suppose we set $y_{1}=v_{2}$ Figure 4: Illustration of (a) a sequence-to-sequence model which generates an output corresponding to (b) a path in the multi-valued decision diagram. 
(a) A sequenceto-sequence model receives input $x$ and random variables $\\\\mathcal{Z}$ , and outputs $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ and $y_{3}=v_{1}$ in three steps. (b) The assignment $\\\\left(y_{1},y_{2},y_{3}\\\\right)=\\\\left(v_{2},v_{3},v_{1}\\\\right)$ corresponds to path $s\\\\;{\\\\xrightarrow{v_{2}}}\\\\;u_{1}\\\\;{\\\\xrightarrow{v_{3}}}\\\\;u_{4}\\\\;{\\\\xrightarrow{v_{1}}}\\\\;t$ −→ −→ −→ in the multi-valued decision diagram.  \\n\\n  \\n\\nand $y_{2}=v_{3}$ in Figure 4(b) and arrive at node $u_{4}$ , the only valid option for $y3$ is to set $y_{3}=v_{1}$ . The other options $y_{3}=v_{2}$ or $y_{3}=v_{3}$ lead to constraint violations. Hence CoreSp masks out the choices of $y_{3}=v_{2}$ and $y_{3}=v_{3}$ for the sequence-to-sequence model. In this way, Core-Sp addresses a key limitation of structured prediction models. We next provide the details of Core-Sp .', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633414841804, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 5, 'chunk_text': \"# 3.1 Embed Constraint Reasoning in Structured Prediction\\nOur proposed Core-Sp framework creates an additional layer to a sequence-to-sequence model to enforce constraint satisfaction for structured problems. It can be integrated into various structured prediction neural networks for different tasks. In this paper, we demonstrate the Core-Sp layer on the sequence-to-sequence structured prediction network. Core-Sp works by masking out the output that violates constraints, thereby providing correctness guarantees. 
Following the discussions of Section 2.1, the sequence-tosequence structured prediction neural network receives input $\\\\mathit{x}\\\\ =\\\\ \\\\left({{x}_{1}},{{x}_{2}},...\\\\,,{{x}_{T}}\\\\right)$ and $z\\\\;=\\\\;(z_{1},z_{2},\\\\ldots,z_{T})$ in sequential format and outputs $y\\\\;=\\\\;(y_{1},y_{2},\\\\ldots,y_{T})$ .In the $k$ -th step, the score vector $o_{k}=(o_{k1},o_{k2},\\\\ldots,o_{k D_{k}})$ is produced by the sequence-to-sequence network, where $O_{k j}$ represents the un-normalized likelihood that $y_{k}$ takes the value $v_{j}$ . Vector $p_{k}=(p_{k1},p_{k2},...\\\\,,p_{k D_{k}})$ is the result after normalizing $o_{k}$ , where $p_{k j}$ is the probability for variable $y_{k}$ to take the value $v_{j}$ . Without the addition of the Core-Sp layer, the probability $p_{k j}$ may be assigned a positive value for certain variable assignments $y_{k}=v_{j}$ that lead to a constraint violation.  \\n\\nThe Core-Sp module enforces constraints by masking out certain entries $p_{k j}$ of the vector $p_{k}$ whose associated assignment $y_{k}\\\\,=\\\\,v_{j}$ is not allowed by the MDD. It does this by tracking a ‘pivot node’ in the MDD. Initially, the pivot node starts at the source node, and it descends along a path determined by the output of Core-Sp in a sequential way. In the example in Figure 4, the pivot node starts at node $s$ , descends along nodes $u_{1}$ ,$u_{4}$ ,and arrives at $t$ , following the output of the sequence-to-sequence model. In each step, Core-Sp maintains a mask vector $c_{k}\\\\;=\\\\;\\\\left(c_{k1},c_{k2},.\\\\perp.\\\\right)c_{k D_{k}})$ based on the current pivot node. Vector $c_{k}$ is used to mask out entries in $p_{k}$ that will lead to constraint violation. If there is no path labeled with $v_{j}$ leaving the current pivot node, $c_{k j}$ is set to 0. Otherwise, $c_{k j}$ is set to 1. 
Suppose the pivot node is at $u_{1}$ in the example shown in Figure 4, $c_{22}$ is set to $0$ , and $c_{21},c_{23}$ are set to 1 because the two outgoing edges from $u_{1}$ are labeled with $v_{1}$ and $v_{3}$ . The next step of Core-Sp is the element-wise multiplication of $p_{k}$ and $c_{k}$ , resulting in $p_{k}^{\\\\prime}$ . Denote $\\\\odot$ as the element-wise vector-vector product, the masking step is computed as $p_{k}^{\\\\prime}\\\\,=\\\\,p_{k}\\\\odot c_{k}$ ⊙.Those entries that lead to constraint violation in $p_{k}^{\\\\prime}$ are zeroed out. To make sure that the probabilities sum up to 1, $p_{k}^{\\\\prime}$ further goes through a re-normalization step. The re-normalized probability vector is computed as: ˜ Finally, $z_{k}$ is sampled uniformly at random from $\\\\mathcal{U}(0,1)$ and the output $y_{k}$ is decided based $\\\\begin{array}{r}{\\\\tilde{p}_{k j}=\\\\frac{p_{k j}^{\\\\prime}}{\\\\sum_{j^{\\\\prime}}p_{k j^{\\\\prime}}^{\\\\prime}}}\\\\end{array}$ P.on the cumulative probabilities $P_{k1},P_{k2},\\\\ldots,P_{k(D_{k}+1)}$ computed from $\\\\tilde{p}_{k}$ :$P_{k1}\\\\,=\\\\,0$ , and $\\\\begin{array}{r}{P_{k j}=\\\\sum_{j^{\\\\prime}=1}^{j-1}\\\\tilde{p}_{k j^{\\\\prime}}}\\\\end{array}$ only if z$z_{k}\\\\,\\\\in\\\\,\\\\left[P_{k j},P_{k(j+1)}\\\\right)$ ∈', for $j=2,3,\\\\dots,D_{k}$ \\x01. Denote assignment indicator vector and $P_{k(D_{k}+1)}=1$ .$y_{k}$ is set to the value of $q_{k}\\\\,=\\\\,(q_{k1},q_{k2},\\\\ldots,q_{k D_{k}})$ $v_{j}$ if and ,where q$q_{k j}$ is an indicator variable for $y_{k}=v_{j}$ . This implies $q_{k j}$ is $1$ if and only if $y_{k}=v_{j}$ ,otherwise $q_{k j}\\\\,=\\\\,0$ . After setting the value of $y k$ , the pivot node descends to a new node along the corresponding arc in the MDD. 
To conclude, the computational pipeline at the $k$ -th step is reflected in the following equations:  \\n\\n$$\\n\\\\begin{array}{r l}&{p_{k j}=\\\\frac{\\\\exp(\\\\rho_{k j})}{\\\\sum_{j=1}^{D_{k}}\\\\exp(\\\\rho_{k j})},}\\\\\\\\ &{p_{k}^{\\\\prime}=p_{k}\\\\odot r_{k},}\\\\\\\\ &{\\\\tilde{p}_{k j}=\\\\frac{p_{k j}^{\\\\prime}}{\\\\sum_{j=1}^{D_{k}}p_{k j}^{\\\\prime}},}\\\\\\\\ &{P_{k j}=\\\\left\\\\{\\\\begin{array}{l l}{0}&{\\\\mathrm{for~}j=1,}\\\\\\\\ {\\\\sum_{j=1}^{j-1}\\\\tilde{p}_{k j},}&{\\\\mathrm{for~}j=2,3,\\\\dots,D_{k},}\\\\\\\\ {0}&{\\\\mathrm{for~}j=D_{k}+1,}\\\\end{array}\\\\right.}\\\\\\\\ &{q_{k j}=\\\\left\\\\{\\\\begin{array}{l l}{1}&{\\\\mathrm{if~}z_{k}\\\\in\\\\left[P_{k j},P_{k(i+1)}\\\\right),}\\\\\\\\ {0}&{\\\\mathrm{otherwise.}}\\\\end{array}\\\\right.}\\\\\\\\ &{y_{k}=v_{j},\\\\qquad\\\\mathrm{if~}q_{k j}=1,\\\\ \\\\mathrm{for~}j\\\\in J_{k},}\\\\end{array}\\n$$  \\n\\nHere $\\\\odot$ denotes the element-wise product between two vectors and $z_{k}\\\\sim\\\\mathcal{U}(0,1)$ . We illustrate how Core-Sp works using the following Example 3.  \\n\\nExample 3 We illustrate the procedure of Core-Sp using the example in Figure 5. Initially, the pivot node for tracking the MDD is the root node $s$ . The first step is to set the value for the first variable $y_{1}$ . Here, the neural network outputs an un-normalized likelihood vector $o_{1}=(0.1,0.2,0.3)$ ity vector $\\\\begin{array}{r}{p_{1}\\\\,=\\\\,\\\\left(\\\\frac{\\\\exp(0.1)}{\\\\exp(0.1)+\\\\exp(0.2)+\\\\exp(0.3)},\\\\frac{\\\\exp(0.2)}{\\\\exp(0.1)+\\\\exp(0.2)+\\\\exp(0.3)}\\\\right.}\\\\end{array}$ \\x10. The next softmax layer receives $O1$ and outputs the normalized probabil,$\\\\begin{array}{r}{\\\\frac{\\\\exp(0.3)}{\\\\exp(0.1)+\\\\exp(0.2)+\\\\exp(0.3)}\\\\approx}\\\\end{array}$ \\x11(0 .30 ,0 .33 ,0 .37) . From the MDD on the right-hand side, $y_{1}$ has only two valid assignments, $v_{2}$ or v3 . 
Therefore, Core-Sp produces a mask vector $c_{1}=(0,1,1)$ , which forbids $y_{1}$ taking the value $v_{1}$ . As in Equation (5) , multiplying $p_{1}$ with $c_{1}$ elementwisely gives us an unnormalized probability vector $p_{1}^{\\\\prime}\\\\,=\\\\,(0,0.33,0.37)$ . After the re-normalization operation in Equation (6) , we obtain $\\\\begin{array}{r}{\\\\tilde{p}_{1}\\\\,=\\\\,(0,\\\\frac{0.33}{0.33+0.37},\\\\frac{0.37}{0.33+0.37})\\\\,\\\\approx\\\\,(0,0.47,0.53)}\\\\end{array}$ . According to Equation (7) , the cumulative probability vector would be: $P_{1}=(0,0,0.47,1)$ . We then uniformly sample $z_{1}$ at random between $\\\\boldsymbol{\\\\theta}$ and $\\\\mathit{1}$ . In this example, $z_{1}=0.4\\\\in[0,0.47)$ , hence we get vector $q_{1}\\\\,=\\\\,(0,1,0)$ and set $y_{1}\\\\,=\\\\,v_{2}$ , that corresponds to Equations (8) and (9) . After setting y1 ’s value, Core-Sp sets the pivot node to $u_{1}$ following the arc $e(s,u_{1})\\\\,=\\\\,v_{2}$ .${\\\\mathit{I t}}$ continues the same process of setting values for y2 and $y_{3}$ . This example sets $y_{2}$ to $v_{3}$ and $y_{3}$ to $v_{1}$ , which corresponds to the blue path in the decision diagram on the right-hand side.  \\n\\n  \\nFigure 5: Architecture of embedding Core-Sp into a sequence-to-sequence model for the decision variables $y_{1},y_{2},y_{3}$ , where the highlighted Core-Sp module encodes the exact MDD in Figure 2(a). Core-Sp descends layer-by-layer in the MDD. Initially, the pivot node of Core-Sp is at root $s$ . The node $s$ limits the value of $y_{1}$ to be $y_{1}\\\\in\\\\mathsf{v a l}(s)=\\\\{v_{2},v_{3}\\\\}$ . If t ks $y_{1}=v_{2}$ , then the pivot node moves to node $u_{1}$ following the arc e$e(s,u_{1})\\\\,=\\\\,v_{2}$ . Next, the node $\\\\boldsymbol{u}_{1}$ limits the value of $y_{2}$ to be $y_{2}\\\\in{\\\\tt v a l}(u_{1})=\\\\{v_{1},v_{3}\\\\}$ . If the neur $y_{2}=v_{3}$ , then the pivot node shifts to node $u_{4}$ following the arc e$e(u_{1},u_{4})\\\\,=\\\\,v_{3}$ . 
Finally, the pivot node descends to $u_{4}$ following the single outgoing arc: $e(u_{4},t)=v_{1}$ . Hence the assignment for variable $y_{3}$ becomes $y_{3}=v_{1}$ .  \\n\\nJiang, Zhang, Hoeve and Xue  \\n\\nProposition 1 Let $\\\\mathcal{M}$ be a non-empty exact MDD that is compiled from the constraint set $\\\\mathcal{C}$ .The sequence-to-sequence model with the addition of Core-Sp is guaranteed to generate structured outputs satisfying all constraints in $\\\\mathcal{C}$ .  \\n\\nProof Because $\\\\mathcal{M}$ is exact, it represents all solutions to $\\\\mathcal{C}$ with respect to the domains of the decision variables. At a specific pivot node $u$ in layer $k$ ,Core-Sp only masks values for $y_{k}$ that do not belong to ${\\\\tt v a l}(u)$ . For all remaining values $\\\\upsilon$ , an edge $e(u,u^{\\\\prime})$ with label $\\\\upsilon$ exists. Moreover, at least one path from $u^{\\\\prime}$ to the terminal node $t$ must exist. Hence, unless $\\\\mathcal{M}$ is empty, the process generates a complete variable assignment satisfying $\\\\mathcal{C}$ .  \\n\\nImplementation. Core-Sp allows for efficient back-propagation of the gradient of the neural network’s parameters. In model training, all computations are differentiable during the gradient backward pass except for the setting of the $q_{k j}$ values. We set $q_{k j}$ to 1 if and only if $z_{k}\\\\,\\\\in\\\\,[P_{k j},P_{k(j+1)})$ .In other words, the value for $q_{k j}$ is determined by $q_{k j}\\\\,=\\\\,\\\\mathbb{1}\\\\{z_{k}\\\\,\\\\ge\\\\,P_{k j}\\\\}\\\\mathbb{1}\\\\{z_{k}\\\\,<\\\\,P_{k(j+1)}\\\\}$ . When computing $\\\\frac{\\\\partial q_{k j}}{\\\\partial P_{k j}}$ , we use the sigmoid function with a huge constant to replace the indicator function $\\\\mathbb{I}\\\\{\\\\cdot\\\\}$ . This operation allows for gradient propagation and improves the numeric stability of gradient computation, and avoids producing NaN or Infinity gradients. 
For the cases where the loss function can be directly defined on $\\\\tilde{p}_{k}\\\\,=\\\\,(\\\\tilde{p}_{k1},\\\\tilde{p}_{k2},\\\\dots,\\\\tilde{p}_{k D_{k}})$ , such as the cross-entropy loss, we do not use $\\\\mathcal{Z}$ variables to sample variables $y_{k}$ during training. The $\\\\mathcal{Z}$ variables are used during testing, as will be detailed for the applications in Section 5. The MDD inside Core-Sp is implemented with two key-value dictionaries. One dictionary memorizes all the mask vectors. It uses the nodes in the MDD as keys and returns the corresponding mask vectors. The other dictionary saves the connectivity of the MDD, it uses the current node in the MDD as the key and returns all of its following nodes in the next layer. This dictionary allows for the pivot node to descend along the path and is also used for the node split and arc filtering procedure discussed in Section 4.\", 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633449444814, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 6, 'chunk_text': '# 3.2 Connection to Existing Works\\nThere are several existing works that also use reasoning tools to enforce constraints in neural network-based models. OptNet (Amos and Kolter, 2017) and MIPaaL (Ferber et al., 2020) propose to encoding quadratic programming (QP) or mixed integer programming (MIP) to enforce constraints, these methods backpropagate the gradients through their optimality conditions. Both approaches require solving a linear programming problem (or MIP problem) in the forward pass. In contrast, our approach pre-computes the feasible set and therefore can be integrated “as is” into the neural net. No LP or MIP solver is needed.  \\n\\nAnother line of work imposes sparsity on the structured output. 
The sparseMAP method (Niculae et al., 2018) models the probability distribution using a combination of a few sparse structured outputs. Their sparsity assumption implicitly enforces constraints by assigning invalid solutions zero probabilities. Because their overall formulation needs to be convex, the types of combinatorial constraints they can handle are limited. The authors generalize their approaches to handle more general logic constraints in their follow-up work LP-sparseMAP (Niculae and Martins, 2020). Their approach is to decompose the problem in the factor graph, and uses the alternating direction method of multipliers (ADMM) to enforce the consistent value assignments towards variables. This approach indeed provides a good way to handle constraints in structured prediction. However, ADMM only ascends towards the maximum of the dual problem, although the primal-dual gap can be large for non-convex problems. Our approach provides an alternative way to handle constraints beyond problem decomposition and harnessing the primal-dual gap.  \\n\\nDeutsch et al. (2019) propose a strategy to formalize the constraints as automata. During inference, the outputs are generated by walking step-by-step in the automata. Compared to this work, the MDD we use is similar to the automata since both of them only use valid paths as valid solutions. However, we enforce Core-Sp during both learning and inference stages while their space-optimized automata can only be applied in inference. We will show in Section 6 that constraint satisfaction during learning actually leads to improvement in learning performance because of the reduced modeling space. In addition, in Section 4 we propose a relaxed search algorithm for MDD structures to automatically find the sweet spot that balances model complexity and learning performance.\\n\\n# 4. 
Searching for the Optimal CORE-SP Structure\\nThe exact MDDs for real-world problems could be arbitrarily large, so the exact MDD may consume too much memory overhead. Because of the large space complexity of exact MDDs, it is not practical to deploy the Core-Sp with the exact MDD on several real-world problems. Also, a large MDD implicitly implies a complex output space, which requires more data to learn an accurate model. In problems where exact MDDs are not practical, relaxed MDDs can be used to reduce the memory requirement. In this section, we explore the trade-offbetween space complexity and learning performance by exploring the usage of relaxed MDDs.  \\n\\nTo find the optimal MDD structure which balances memory consumption and learning performance, we propose an iterative search procedure, presented in Algorithm 1. We tune the width parameter to find a relaxed Core-Sp that achieves the optimal performance. The algorithm starts increasing the width from 1 to the given hyper-parameter maximum layer with $\\\\omega_{\\\\mathrm{max}}$ , iteratively learning Core-Sp model with new MDD model $\\\\mathcal{M}$ on the training set, validating their learning performance on a separated validation data set until finding a good MDD structure. The inputs to Algorithm 1 are training and validation data sets $(D^{t r},D^{v a l})$ , parameters of the sequence-to-sequence neural network $\\\\theta$ , a set of constraints $\\\\mathcal{C}$ and the maximum width $\\\\omega_{\\\\mathrm{max}}$ of MDD. In the beginning, the initMDD function initializes a width-1 MDD. At every iteration, the train function trains the neural network with the constraints enforced in the relaxed MDD via gradient descent on the training data set. This is detailed in Section 3.1. Then the validation function evaluates the performance on a separate validation data set. In line $6\\\\!-\\\\!9$ of Algorithm 1, we evaluate if the current MDD has a better performance than the previous one. 
If the loss on the validation set is decreasing, the algorithm would continue the relaxation; otherwise, the algorithm terminates and returns the current Core-Sp as well as the learned parameters.  \\n\\nFor the relaxMDD procedure, it takes a relaxed MDD as input and relaxes all its layers from top to bottom to the given width. For each layer, the algorithm repeatedly picks a node and then splits it into two nodes, which corresponds to the nodeSplit function until the width of the layer reaches the given width. Every node split is followed by an arc filtering process, denoted by arcFilter , to enforce constraints. Hoda et al. (2010) present MDD arc filtering procedures for various constraint types. Figure 3 gives an example of the nodeSplit and arcFilter processes on a relaxed MDD.  \\n\\n  \\n\\nNote that in node splitting (line $14-17$ of Algorithm 1), those nodes with only one incoming arc are skipped for the splitting process, i.e. $u.i n\\\\ge1$ . In our paper, the heuristic for splitting is: the set of incoming arcs (noted as .in ) of the original node is randomly assigned to the two newly created nodes $(v,w)$ and then the outgoing arcs (noted as .out )are copied to nodes $v,w$ . There are other heuristic methods for node splitting. We refer the readers to Bergman et al. (2016a) for details. The arcFilter function is applied to remove those outgoing arcs that lead to constraint violations.  \\n\\nLimitations. While the advantages of Core-Sp will be demonstrated by realistic applications in Section 5, the framework has several limitations. First, the current constraint reasoning module based on MDDs cannot enforce continuous-valued constraints. Indeed, it is possible to apply discretization to transform continuous constraints into discrete ones and then use Core-Sp .However, such discretization may result in very large decision diagrams. Second, the decision diagram is based on a sequence-to-sequence structured prediction model. 
Other encoding-decoding structures, for example, encoding-decoding on a graph, may require exploring other types of structured prediction models. We leave the study of these limitations as future works.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633484047824, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 7, 'chunk_text': '# 5.1 Vehicle Dispatching Service Planning\\nTask Definition. Consider a routing problem in which one needs to dispatch a service vehicle to perform maintenance at a set of locations. The sets of locations differ per day and are rarely the same. Previous routes indicate that the driver does not follow a clear objective, such as minimizing the distance or time. Instead, historical data suggest that the driver has an underlying route preference, such as visiting a shopping mall after leaving a restaurant. Our task is: given the historic routes and a set of requested locations, determine a path that visits all the locations once and only once while capturing the hidden trends embedded in the historical data. To be specific, given a request to visit a set of locations $x=\\\\{x_{1},x_{2},...\\\\,,x_{T_{i}}\\\\}$ in the $i$ -th day, determine $y=(y_{1},y_{2},...,y_{T_{i}})$ , which forms a permutation of $x$ and captures the driver’s preferences. For this application, we assume an upper bound $T$ on the number of sites to visit per day. In other words, for all $i$ ,$T_{i}\\\\leq T$ .  \\n\\nTraditional optimization methods such as integer programming or constraint programming do not work well in this context since they are unable to represent an appropriate objective function for the latent route preference (Braekers et al., 2016). Machine learning models on the other hand can be used to learn the underlying pattern from the historical routes (Junior Mele et al., 2021). 
Nevertheless, the routes generated from pure machine learning models cannot satisfy key operational constraints. They may visit some locations multiple times, or fail to visit all locations. Post-processing steps, such as removing redundant locations and randomly appending unvisited locations, have been tried to fix the output of machine learning models (Deudon et al., 2018). However, their performance is limited in our experiments (see Section 6.1 for details).  \\n\\nDefinition of Constraints. The input of this application is a set of locations to visit for day $i$ :$x=\\\\{x_{1},x_{2},\\\\dots,x_{T_{i}}\\\\}$ . The goal is to generate a schedule $y$ to represent the order of visiting the locations, where $y$ is a permutation of $x$ . The route $y$ needs to satisfy the following constraints:  \\n\\n•full-cover constraint. The delivery route should visit all and only the locations in $x$ . In other words, the set of locations in $y$ is the same as the set in $x$ .•all-diff constraint. The route should not visit one place twice. In other words, $y_{j}\\\\neq y_{k}$ for all $y_{j},y_{k}\\\\in y$ and $j\\\\neq k$ .  \\n\\nWe note that our MDDs can potentially incorporate other constraints such as time windows or precedence constraints (Cir´e and van Hoeve, 2013).  \\n\\n  \\nFigure 6: The MDDs for the vehicle dispatching service planning task. (a) An exact MDD that models the visit to “Hof”, or “Haar”, or both of them. All arcs of solid lines are of the first type and arcs of dashed lines are of the second type, which directs the delivery agent to the stop location $t$ .(b) A width-1 relaxed MDD, which is formed by combining nodes $u_{1}$ and $u_{2}$ of the exact MDD.  \\n\\nMDD Construction. The delivery routes have at most $T$ locations in the data set, so the MDD graph $\\\\mathcal{M}$ would contain $T+2$ layers. There is a single source node $s$ in the first layer and a single sink node $t$ at the last layer. There are two types of arcs in the MDD. 
For the first type, an arc $e(u,u^{\\\\prime})$ with label $v_{i}$ (where $u^{\\\\prime}\\\\neq t$ ) in the $j$ -th layer represents that we visit $v_{i}$ as the $j$ -th location in the schedule. The second type of arcs $e(u,t)$ with label $t$ connects every node to the sink node $t$ , allowing the delivery agent to travel to the ending location at any time.  \\n\\nFigure 6(a) shows an example MDD. Here, the delivery agent needs to visit “Hof” or “Haar” (two towns in Bavaria) or both of them. The arcs of the first type are shown in solid lines and the arcs of the second type are shown in dashed lines. The two arcs $e(s,u_{1})=$ $H o f,e(s,u_{2})=H a a r$ leaving the root node $s$ denote the first location that the delivery agent visit, which can be either “Hof” or “Haar”. The three arcs $e(u_{1},t)=e(u_{2},t)=e(u_{3},t)=t$ are of second type. The purpose is to bring the delivery agent to the ending location $t$ . In practice, an MDD that represents all valid paths can be of exponential size with respect to the number of maximum locations. Figure 6(b) shows a width-1 relaxed MDD, formed by combining nodes $u_{1}$ and $u_{2}$ in Figure 6(a). The paths in relaxed MDDs form a superset of all valid paths. As we can see, “Hof $\\\\rightarrow\\\\mathrm{Hof}\\\\rightarrow\\\\mathrm{t^{\\\\ast}}$ is a path in the relaxed MDD, but it violates the all -diff constraint.  \\n\\nMDD Arc Filtering. The previously constructed MDD contains routes up to length $T$ considering all possible locations. To enforce the all -diff constraint, we apply the arc filtering rules from Cir´e and van Hoeve (2013). To enforce the full-cover constraint for a specific day with request $x$ , we apply the following steps:  \\n\\n1. Remove all arcs whose label does not belong to $x$ (these are arcs of the first type). This ensures that only locations in $x$ are considered.  \\n\\n2. 
Remove all arcs with label $t$ , except for arcs $e(u,t)$ where $u$ belongs to layer $|x|+1$ .Remove all nodes and associated arcs from the layers between layer $|x|+1$ and the last layer. This ensures that the routes in the MDD have the appropriate length.  \\n\\n3. Remove all nodes and arcs that do not belong to an $s$ -$t$ path.  \\n\\nThe first two steps are implemented in a top-down pass of the MDD, while the last step requires an additional bottom-up pass. The total time complexity is therefore linear in the size of the MDD.  \\n\\nModel Structure. We employ the Core-Sp module on a conditional Generative Adversarial Network (cGAN) to generate routes that capture the implicit preferences of the drivers while preserving the operational constraints. In the generative adversarial structure, the generator network $G$ is trained to generate routes to mimic the pattern in the training data set. The discriminator network $D$ is trained to separate the generated routes from the actual ones in the training data set. When training converges, the discriminator should not be able to tell the difference between the true outputs and the structures generated by the generator. In return, the generator generates structures that closely look like the ones in the data set. The Core-Sp module is embedded in the generator and filters out those routes that violate the operational constraints. As a result, the generated routes would satisfy all operational constraints. We employ the conditional GAN model structure because the element-wise loss function is not ideal to measure the distance between the predicted route and the ground truth route. Namely, suppose one route $\\\\left(y_{T},y_{T-1},\\\\ldots,y_{1}\\\\right)$ is the reverse of the optimal route $(y_{1},y_{2},\\\\ldots,y_{T})$ . Both of them may be equally good to fit the delivery constraints as well as the driver’s underlying preference. 
However, an element-wise loss function penalizes the shifted route heavily because it is different from the optimal route in every location.  \\n\\nThe overall conditional GAN with the Core-Sp architecture is shown in Figure 7, which is composed of the generator $G$ and the discriminator $D$ . The generator $G$ takes the set of locations $x$ as input and outputs the un-normalized score vectors $\\\\left(o_{1},o_{2},\\\\ldots\\\\right)$ , where vector $o_{j}$ denotes the un-normalized likelihood of visiting each location at the $j$ -th step. Core-Sp receives these score vectors and the random values $(z_{1},z_{2},\\\\ldots)$ as inputs, and outputs a valid route $(q_{1},q_{2},\\\\ldots)$ . Here, $q_{j}\\\\,=\\\\,(q_{j1},\\\\ldots,q_{j N})$ is a vector, where $q_{j k}$ is an indicator variable representing whether location $k$ is visited in the $j$ -th step. Finally, the discriminator function $D$ tries to separate the predicted route $(q_{1},q_{2},\\\\cdot\\\\cdot\\\\cdot)$ and the groundtruth route $(y_{1},y_{2},\\\\cdot\\\\cdot\\\\cdot)$ . Here, each $y_{j}\\\\,=\\\\,(y_{j1},\\\\dotsc,y_{j N})$ is again represented as a vector, where $y_{j k}$ indicates whether to visit location $k$ in the $j$ -th step. The generator $G$ uses an encoder to learn a representation vector for the input and uses a sequential decoder to generate the schedule. In each step, the daily request $x$ is fed as an input vector, where the locations in the requested set are marked as $1$ .$G$ uses the following LSTM structure to generate the schedule:  \\n\\n$$\\n\\\\begin{array}{r}{\\\\boldsymbol{h_{j}}=\\\\mathsf{L S T M}(\\\\boldsymbol{x},\\\\boldsymbol{h_{j-1}}),}\\\\\\\\ {\\\\boldsymbol{o_{j}}=\\\\boldsymbol{W}\\\\boldsymbol{h_{j}},\\\\qquad\\\\qquad}\\\\end{array}\\n$$  \\n\\nwhere the score vector $o_{j}$ represents the likelihood of picking the next location in the $j$ -th step. The score vector $o_{j}$ and the random variable $z_{j}$ are then fed into the Core-Sp module. 
The Core-Sp module removes invalid locations and produces $q_{j}$ according to the  \\n\\n  \\nFigure 7: The conditional GAN with Core-Sp module for the vehicle dispatching service planning problem. Vector $x$ represents the requested delivery locations in day $i$ .$(\\\\tilde{y}_{1},\\\\tilde{y}_{2},\\\\ldots)$ represents the generated path from Core-Sp , represented using indicator vectors $(q_{1},q_{2},\\\\ldots)$ . The generator $G$ takes the set of locations $x$ as input and uses a sequential encoder to learn a representational vector. Then it outputs a sequence of score vector $\\\\left(o_{1},o_{2},\\\\ldots\\\\right)$ using a sequential decoder, where $o_{j}$ denotes the likelihood of picking the next locations at the $j$ -th step. The CoreSp module removes invalid locations. The Discriminator $D$ is used to separate the real path $(y_{1},y_{2},\\\\ldots)$ from the generated path $(\\\\tilde{y}_{1},\\\\tilde{y}_{2},\\\\ldots)$ .  \\n\\nrandom value of $z_{j}$ . The detailed equations of Core-Sp were presented in Section 3.1. The discriminator $D$ is trained to separate the generated schedule $q=(q_{1},q_{2},.\\\\ldots,q_{T_{i}})$ from the real schedule $y=(y_{1},y_{2},...,y_{T_{i}})$ . It uses the following LSTM structure:  \\n\\n$$\\n\\\\begin{array}{r}{\\\\tilde{s}_{j}=\\\\mathtt{L S T M}(q_{j},\\\\tilde{s}_{j-1}),}\\\\\\\\ {s_{j}=\\\\mathtt{L S T M}(y_{j},s_{j-1}),}\\\\end{array}\\n$$  \\n\\nwhere $\\\\tilde{s}_{j}$ denotes the hidden vector after encoding the first $j$ generated locations and $s_{j}$ denotes the hidden vector after encoding the first $j$ locations in the real data. The output of the discriminator $D$ is $\\\\sigma(U s)$ , where $U$ is a linear transformation matrix and $s$ is either $s_{T_{i}}$ or $\\\\tilde{s}_{T_{i}}$ . The sigmoid activation function is $\\\\sigma(s)=1/(1+\\\\exp(-s))$ . 
Overall, $D$ and $G$ are Constraint Reasoning Embedded Structured Prediction', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633517077970, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 8, 'chunk_text': '# Input sentence:\\nBlink light of your Philips Hue when your Amazon Alexa timer hits 0.\\n\\n# Output labels:\\n<html><body><table><tr><td>trigger-service ts V</td><td>trigger-function ytf</td><td>action-service as</td><td>action-function yaf</td></tr><tr><td>amazon-alexa</td><td>timer-goes-off</td><td>Philips-hue</td><td>blink-light</td></tr></table></body></html>  \\n\\nFigure 8: An example of if-then program synthesis task. The input is a natural language description of the program. The output are four labels: trigger-service ,trigger-function ,action-service and action-function .The semantics of the synthesized if-then program are: if trigger-function happened at trigger-service , then take action-function at the action-service .  \\n\\ntrained by minimizing the loss function in a competing manner using stochastic gradient descent. The loss function $\\\\mathcal{L}$ is:  \\n\\n$$\\n\\\\begin{array}{r l}{\\\\underset{G}{\\\\operatorname*{min}}\\\\underset{D}{\\\\operatorname*{max}}}&{\\\\mathbb{E}_{x,y}\\\\left[\\\\log D\\\\left(y,x\\\\right)\\\\right]+\\\\mathbb{E}_{z,x,y}\\\\left[\\\\log\\\\left(1-D\\\\left(G\\\\left(x,z\\\\right),y\\\\right)\\\\right)\\\\right].}\\\\end{array}\\n$$\\n\\n# 5.2 If-Then Program Synthesis\\nTask Definition. Many internet applications provide automatic services to meet user needs, including daily weather reports and video streaming services. Connectivity platforms such as IFTTT $^{11}$ and Zapier $^2$ can streamline services from different providers by connecting simple services into more complex ones, in the form of if-then programs. 
For instance, the smart device Philips Hue can automatically blink lights when commands are sent from a cellphone. As another example, Amazon Alexa can be programmed as a timer via voice commands. Given these two services, users can set up an if-then program on the IFTTT platform for more complicated tasks. For example, an if-then program can command the Philips Hue to blink lights when the timer in Amazon Alexa reaches zero. However, it may take several hours for inexperienced users to learn the IFTTT website’s interface before they can implement the program above. If such if-then programs can be automatically synthesized from natural language to provide suggestions for inexperienced users, this will help to reduce the overhead in using such platforms and boost the users’ efficiency.  \\n\\nWe consider the task of generating if-then programs from natural language as a structured prediction task. In our setup, an if-then program is made up of four components: trigger-service ,trigger-function ,action-service , and action-function . The logic is “if trigger-function happens in the trigger-service , then take the action-function from the action-service ”. Such programs can be represented using this pseudo-code:  \\n\\nIF trigger-service.trigger-function THEN action-service.action-function  \\n\\nJiang, Zhang, Hoeve and Xue  \\n\\nFigure 8 shows an example of this task. 
We would like to transform a user’s text description: “Blink light of the Philips Hue when the Amazon Alexa timer hits $\\\\boldsymbol{\\\\theta}$ ”into the following ifthen program:', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633550894548, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 9, 'chunk_text': '# IF Alexa.timer-go-offTHENHue.blink-light\\nThe challenge in if-then program synthesis is to enforce the constraints between the services and the associated functions. Without enforcing constraints, the output of the structured prediction model may be invalid. For instance, the model can predict “Hue” for trigger-service , but perhaps assigns “report rain” to the trigger-function , while we know before training that the smart device “Hue” does not provide any weather reporting services.  \\n\\nDefinition of Constraints. We introduce the Functionality constraint for the if-then program synthesis as follows. Let $s$ be a service. We define a mapping $F(s)$ to be the set of functions that can be associated with $s$ . For example, if $s$ is “weather service”, then the output of $F(s)$ is a set that contains functions such as “hourly report”, “tomorrow forecast”, “severe weather alarms”, etc. The Functionality constraints for all the four components are:  \\n\\n$$\\n\\\\mathbf{trigger-service}=s\\\\Rightarrow\\\\mathbf{trigger-function}\\\\in F(s),\\n$$  \\n\\nFor each service $s$ , the mapping $F(s)$ is collected from the associated reference page and provided as prior information.  \\n\\nMDD Construction. The Functionality constraints can be represented using an MDD with five layers, where the first layer has one source node $s$ and the last layer has one sink node $t$ . Each arc between the first and second layer corresponds to a value assignment to the variable trigger-service . 
Each arc between the second and third, third and fourth, and fourth and fifth layers corresponds to a value assignment to variable trigger-function ,action-service , and action-function , respectively. Figures 9(a) and (b) represent a width-1 and width-2 MDD for if-then program synthesis.  \\n\\nIn the MDD, multiple arcs from the first layer can be connected to a single node $u$ in the second layer. The node $u$ hence represents the set of if-then programs that have a given subset of trigger services. The set of arcs leaving from $u$ represents the union of all the trigger functions that are associated with the trigger services connecting $u$ . For example, Figure 9(a) demonstrates that both Alexa and Youtube can be associated with the streaming and timer services. Notice that this is a relaxed MDD. In practice, only Alexa has the timer service and only Youtube has the streaming service. Similar semantic meaning holds for the arcs between the 3rd and 4th layers, representing action services and action functions.  \\n\\nThe width of the MDD can be expanded to enforce constraints more precisely. Figure ${\\\\mathfrak{g}}({\\\\mathrm{b}})$ shows an example of the width expansion of MDD. Here, the nodes $u_{1}$ in (a) is split into two nodes $\\\\tilde{u}_{1}$ ,$\\\\hat{u}_{1}$ in (b). After arc filtering, $\\\\tilde{u}_{1}$ is connected to “timer” only while $\\\\hat{u}_{1}$ is connected to “streaming” only because only “Alexa” has the service “timer” and only “Youtube” has the service “streaming”. Similar node splitting and arc filtering are applied for action services and functions as well. In a nutshell, the relaxed MDD can be expanded into an exact one using repeated node splitting and arc filtering.  \\n\\n  \\nFigure 9: Examples of a relaxed and an exact MDD for the if-then program synthesis task. The exact MDD in (b) models the constraints that only the timer service is provided by Alexa, and only the streaming service is provided by Youtube. 
Similarly, only Hue provides the light service, and only Twitter provides the tweet service. (a) is a relaxed MDD, where both trigger services provide both trigger functions, and both action services provide both action functions.  \\n\\nModel Structure. We employ Core-Sp on the LatentAttention model proposed by Liu et al. (2016), which achieved the state-of-the-art result on if-then program synthesis. The LatentAttention model is a bidirectional LSTM with residual connection, followed by the self-attention mechanism. To be specific, the bidirectional LSTM (Bi-LSTM) encodes a natural sentence input of length $T$ :${\\\\boldsymbol{x}}=(x_{1},x_{2},\\\\ldots,x_{T})$ into $T$ latent vectors $(h_{1},h_{2},\\\\ldots,h_{T})$ .Here, $x_{j}$ is a one-hot vector representing the $j$ -th word in the sentence. Each vector $h_{j}$ is a concatenation of a forward vector $\\\\overrightarrow{h_{j}}$ and a backward vector $\\\\overleftarrow{h_{j}}$ . Suppose vector $\\\\overrightarrow{h_{j}}$ and $\\\\overleftarrow{h_{j}}$ are of length ${\\\\boldsymbol{\\\\mathit{m}}}$ , vector $h_{j}$ will be of length $2m$ . The forward vector $\\\\overrightarrow{h_{j}}$ is the result of encoding input words $x_{1},x_{2},\\\\ldots,x_{j}$ from the left through an LSTM, and the backward vector $\\\\overleftarrow{h_{j}}$ is the result of encoding input words $x_{T},\\\\ldots,x_{T-j}$ from the right. More precisely, in mathematical form:  \\n\\n$$\\n\\\\begin{array}{r l}&{\\\\overrightarrow{h_{j}}=\\\\mathtt{L S T M}(x_{j},\\\\overrightarrow{h_{j-1}}),}\\\\\\\\ &{\\\\overleftarrow{h_{j}}=\\\\mathtt{L S T M}(x_{T-j+1},\\\\overleftarrow{h_{j+1}}),}\\\\\\\\ &{h_{j}=\\\\left[\\\\overrightarrow{h_{j}};\\\\overleftarrow{h_{j}}\\\\right].}\\\\end{array}\\n$$  \\n\\nwhere $[\\\\,;\\\\,]$ denotes concatenation of two vectors. The detailed equations for the LSTM neural network with a residual connection can be found in Greffet al. (2017). 
In the second step, we encode the sequence of vectors $(h_{1},h_{2},\\\\ldots,h_{T})$ into a single vector $g$ through an attention  \\n\\n  \\nFigure 10: Model structure of If-then program synthesis. Text input $x_{1},x_{2},\\\\ldots,x_{T}$ is fed into bi-directional LSTM with self attention mechanism. The un-normalized likelihood vectors $o^{\\\\mathrm{ts}},o^{\\\\mathrm{tf}},o^{\\\\mathrm{as}},o^{\\\\mathrm{af}}$ are fed into the Core-Sp module for constraint satisfaction.  \\n\\nmechanism, which is similar to Shaw et al. (2018):  \\n\\n$$\\n\\\\begin{array}{r l}&{\\\\alpha_{j k}=\\\\displaystyle\\\\frac{\\\\exp({h_{k}}^{\\\\top}{h_{j}})}{\\\\sum_{k^{\\\\prime}=1}^{T}\\\\exp({h_{k^{\\\\prime}}}^{\\\\top}{h_{j}})},\\\\quad\\\\mathrm{~for~all~}j,k\\\\in\\\\{1,\\\\ldots,T\\\\},}\\\\\\\\ &{g=\\\\displaystyle\\\\sum_{j=1}^{T}\\\\sum_{k=1}^{T}\\\\alpha_{j k}h_{k}.}\\\\end{array}\\n$$  \\n\\nThis Bi-LSTM with attention neural network structure encodes the entire sentence into one latent vector $g$ of size $2m$ .Let $U^{\\\\mathrm{ts}}$ be a matrix of size $|\\\\#\\\\mathrm{service}|\\\\,\\\\times\\\\,2m$ , w eervice |is the number of trigger-service options. We similarly define matrices U$U^{\\\\mathrm{tf}}$ ,$U^{\\\\mathrm{as}},\\\\ U^{\\\\mathrm{af}}$ for trigger-function ,action-service ,action-function respectively with their corresponding shapes. The un-normalized likelihoods for trigger -service $o^{\\\\mathrm{ts}}$ , for trigger-function $o^{\\\\mathrm{tf}}$ , for action -service $o^{\\\\mathrm{as}}$ , and for action -function $o^{\\\\mathrm{af}}$ are defined as  \\n\\n$$\\no^{\\\\mathrm{ts}}=U^{\\\\mathrm{ts}}g,\\\\quad o^{\\\\mathrm{tf}}=U^{\\\\mathrm{tf}}g,\\\\quad o^{\\\\mathrm{as}}=U^{\\\\mathrm{as}}g,\\\\quad o^{\\\\mathrm{af}}=U^{\\\\mathrm{af}}g.\\n$$  \\n\\nThe vectors $o^{\\\\mathrm{ts}},o^{\\\\mathrm{tf}},o^{\\\\mathrm{as}},o^{\\\\mathrm{af}}$ are fed into the Core-Sp module. 
During training, we use crossentropy loss as the loss function $\\\\mathcal{L}$ that minimizes the difference between the ground-truth prediction and the probabilities $\\\\tilde{p}^{\\\\mathrm{ts}}$ ,$\\\\tilde{p}^{\\\\mathrm{tf}}$ ,$\\\\tilde{p}^{\\\\mathrm{as}}$ ,$\\\\tilde{p}^{\\\\mathrm{af}}$ produced from Core-Sp (definition of these probabilities are in Equation 6). This training procedure is similar to the teacherforcing approach used in Sutskever et al. (2014) to accelerate the learning speed. Variables $z$ are used to sample particular ( trigger -service ,trigger -function ,action -service ,action -function ) quadruples from the probability distribution given by $\\\\tilde{p}^{\\\\mathrm{ts}}$ ,$\\\\tilde{p}^{\\\\mathrm{tf}}$ ,$\\\\tilde{p}^{\\\\mathrm{as}}$ ,$\\\\tilde{p}^{\\\\mathrm{{af}}}$ during testing.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633586283990, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 10, 'chunk_text': '# 5.3 Text2SQL Generation\\nTask Definition. Formatted data such as travel records and stock market transactions are stored in relational databases. Currently, accessing the database requires a data scientist who masters the SQL query language. Our task is to automatically synthesize SQL queries from natural language sentences using machine learning. Compared with the data expert approach, SQL query generation requires deeper reasoning across the structure of the database, the semantics of the structured query language, and the understanding of natural language. As shown in Figure 11, the input of the text2SQL generation is a sentence that describes the query in natural language and the table headers in the relational database. The output is a SQL query with the following structure:  \\n\\nSELECT agg-op sel-col WHERE (cond-col cond-op cond-val) AND ...  
\\n\\nHere, SELECT and WHERE are keywords in the SQL language. What we need to predict are: (1) the aggregation operator $\\\\mathsf{a g g-o p}$ , which chooses among the set {empty, COUNT, MIN, MAX, SUM, AVG }; (2) the column name in selection sel-col and (3) the column name in condition cond-col , both of which are chosen from the table headers; (4) the conditional operator cond-op , which is in $\\\\{=,<,>\\\\}$ ; (5) the conditional value cond-val , which is assumed to be a sub-sequence of the given query. Here, one bracket pair () represents one conditional statement. The SQL query may have multiple conditions, which are denoted above by “ ... ”. Figure 11 displays this SQL query:\\n\\n# SELECT COUNT \"School\" WHERE \"No.\" = \"3\"\\nHere agg-op is COUNT ;sel-col is “school”, which is a column name from the table headers. One cond-col is “No.”, which also comes from the table headers. The cond-op is “=”. The cond-val is “3”, which we assume is from the input query. This example has one condition but multiple conditions are allowed.  \\n\\nDefinition of Constraints. Existing generative neural models for this task are not guaranteed to generate a query that follows the grammar of a SQL query. To avoid grammar violations, we compile a set of common SQL grammars as constraints into the Core-Sp module. The Core-Sp module will ensure that all the generated SQL queries follow the grammatical constraints. Our constraints are defined on the operators, namely the conditional operator cond-op and the aggregation operator agg-op . The domains of these operators are dependent upon the data types of the entities (namely, cond-col and sel-col )they operate on. Consider the previous example. The agg-op can only take values between $\\\\{\\\\mathrm{empty,~\\\\coUNT}\\\\}$ , because the sel-col is “school”, which is of the string type. More precisely, let $s$ be a column header (the value of sel-col or cond-col ). 
We define $F_{a}(s)$ as  \\n\\nInput Table:   \\n\\n\\n<html><body><table><tr><td></td><td>Player</td><td>No.</td><td>Position</td><td>School</td></tr><tr><td>0</td><td>Antonio</td><td>21</td><td>Guard-Forward</td><td>Duke</td></tr><tr><td>1</td><td>Voshon</td><td>2</td><td>Guard</td><td>Minnesota</td></tr><tr><td>2</td><td>Marin</td><td>3</td><td>Guard-Forward</td><td>Butler CC</td></tr></table></body></html>\\n\\n# Input Query:\\nHow many schools did player number 3 play at?\\n\\n# Output SQL Query:\\nFigure 11: An example for the Text2SQL generation task. The input is the text query “How many schools did player number 3 play at?” and the table header “ Player, No., Position, School ” from the relational database. The output should be the SQL query: SELECT COUNT \"School\" WHERE \"No. $\"~=~\"3\"$ .  \\n\\nthe set of aggregation operators agg-op that can be associated with $s$ , and $F_{c}(s)$ as the set of condition operators cond-op that can be associated with $s$ . That is:  \\n\\n$$\\n\\\\begin{array}{r l}&{F_{a}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT,~\\\\forall\\\\mathrm{IIN},~\\\\forall\\\\mathrm{IAX},~\\\\forall\\\\mathrm{II},~\\\\mathrm{AVG}\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~string~type}}\\\\end{array}\\\\right.}\\\\\\\\ &{F_{c}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{=,~\\\\displaystyle>,~\\\\varsigma\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{=}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~string~type}}\\\\end{array}\\\\right.}\\\\end{array}\\n$$  \\n\\nWe also introduce dataype constraints, which are defined as:  \\n\\n$$\\n\\\\begin{array}{r l}&{\\\\mathtt{s e l-c o l}=s\\\\Rightarrow\\\\mathtt{a g g-o p}\\\\in F_{a}(s),}\\\\\\\\ &{\\\\mathtt{c o n d-c o l}=s\\\\Rightarrow\\\\mathtt{c o n d-o p}\\\\in F_{c}(s).}\\\\end{array}\\n$$  \\n\\nModel 
Structure. We embed the Core-Sp module to SQLova (Hwang et al., 2019), the state-of-the-art neural network for text2SQL generation. SQLova has a sequence-tosequence architecture. It first encodes a natural language sentence and the table headers into a high-dimensional vector. Then the decoder of SQLova decodes the hidden representation into the predictions of various entities in the SQL query. SQLova first determines the number of conditions in the SQL query and then fills in the ( cond-col ,cond-op ,cond-val ) for each condition. The operators agg-op, cond-op are predicted as a classification task from a fixed set of operators. Column names cond-col, sel-col are predicted from the set of table headers in the relational database. The cond-val is predicted by a pointer neural network which points at a span of the input natural language sentence. The selected span of the query is used as the cond-val (Dong and Lapata, 2018).  \\n\\nMDD Construction. The associated MDD that encodes the constraints for text2SQL generation is similar to the MDD for if-then program synthesis. The MDD is split into layers and every two layers form a group. One two-layer group is used to enforce constraints on an operator-column name pair. The operator-column name pair can be $\\\\mathsf{a g g-o p}$ and sel-col ,or can be cond-op and cond-col . Note that there can be only one group of $\\\\mathsf{a g g-o p}$ and sel-col and more than one group of cond-op and cond-col . In the first layer of the group, the column name is determined. In the second layer, the invalid operators are ruled out based on the type of the column name selected in the first layer. The two-layer group is copied several times because the SQL query can contain multiple conditions.  
\\n\\nConstraint Reasoning Embedded Structured Prediction', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633618789848, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 11, 'chunk_text': '# 6. Results and Analysis\\nWe demonstrate the effectiveness of the Core-Sp module on the three applications from Section 5. We mainly focus on two metrics: (1) the percentage of valid structures generated; and (2) the learning performance. Metric (1) evaluates whether Core-Sp is able to improve constraint satisfaction for the structures generated by neural network models, while metric (2) considers whether Core-Sp improves the overall performance of neural network models in pattern detection from data. For the task of if-then program synthesis and text2SQL generation, we use accuracy as the metric for learning performance. It measures the percentage that the predicted structures match exactly with the ground-truth structures in the testing set. For the vehicle dispatching service planning, we introduce a quantitative metric that measures how close the generated routes resemble those in the training set. The quantitative metric will be discussed in later text. We also demonstrate the effect of the MDD structures, especially the change of the layer width, on the overall performance of the Core-Sp module.  \\n\\nOur experimental results demonstrate the efficiency of Core-Sp in boosting both the percentages of valid structures generated and the learning performance. In terms of constraint satisfaction, the percentage of valid routes generated increases from $1\\\\%$ to $100\\\\%$ for vehicle dispatching service planning with the embedding of Core-Sp on a conditional GAN model. 
The percentage of valid programs also increases from about 88% to $100\\\\%$ for if-then program synthesis when Core-Sp is added to the LatentAttention model, and from 83% to $100\\\\%$ for text2SQL generation when Core-Sp is added to SQLNova on a hard test set. Both the LatentAttention and SQLNova are state-of-the-art models for the corresponding tasks. Furthermore, the Core-Sp module also helps improve learning performance. For the if-then program synthesis, the accuracy is $44\\\\%$ for Core-Sp compared to $42\\\\%$ for the LatentAttention model. The neural network also converges to relatively higher accuracy with fewer training epochs. In the text2SQL task, the execution accuracy improves from $76.1\\\\%$ obtained from the state-of-the-art SQLNova model to $78.0\\\\%$ while the logical accuracy improves from $58.3\\\\%$ to $62.5\\\\%$ with the Core-Sp module embedded. The code for all the experiments is available at GitHub.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633652344282, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 12, 'chunk_text': '# 6.1 Vehicle Dispatching Service Planning\\nOur experiments are on a data set consisting of 29 cities in Bavaria. We vary the number of maximum locations $T$ in the daily requests from 2 to 29 in generating the training and testing sets. We generate $N=10,000$ instances for every given $T$ . The daily requests are randomly sampled from all sets of locations of the specified size. The optimal delivery paths are generated assuming that the delivery agent is maximizing a hidden reward function:  \\n\\n$$\\nR(y)=\\\\sum_{j=1}^{T-1}\\\\tt p r e f\\\\left(y_{j},y_{j+1}\\\\right).\\n$$  \\n\\nHere the scalar value pref $(y_{j},y_{j+1})\\\\,\\\\in\\\\,[0,1]$ is the delivery agent’s implicit preference to visit location $y_{j+1}$ after leaving the location $y_{j}$ . 
When generating the data set, we enumerate all valid delivery routes and select the one that maximizes this reward function $R$ . Notice that this reward function $R$ was fixed during the data generation step and was hidden to the machine learning algorithms. During the evaluation, the reward function was used as one quantitative measure for the quality of the routes generated. The higher the reward function values, the better the machine learning algorithm was able to capture the hidden preferences of the delivery agent.  \\n\\nFigure 12 (left) presents the memory requirements to represent relaxed MDDs of varying maximum width for routes with lengths $T\\\\,=\\\\,6,7,8,9$ .Figure 12 (right) shows the percentage of valid routes generated during training by Core-Sp with relaxed MDDs of varying maximum widths, for $T\\\\,=\\\\,10$ . For maximum width 128, the relaxed MDD can produce over 80% valid routes, but the large memory usage is prohibitive. In particular, for length $T\\\\,=\\\\,29$ , the exact MDD will exhaust the memory of our machine. Therefore, we also consider an iterative algorithm that incrementally creates the exact MDD alongside the output of the sequential decoder. At step $t$ , we follow the prediction made by the sequential decoder and only expand one step of the MDD starting at the node selected by the sequential decoder. This corresponds to only loading its current outgoing arcs at every predicted step of the exact MDD. Using this idea, we are able to expand the exact MDD even for a larger number of locations ( $T=29$ ).  \\n\\nValid Routes Comparison. We evaluate the performance of Core-Sp in generating valid routes, i.e. , those satisfying the all-diff and full-cover constraints, over data sets of different sizes. As a baseline for comparison, we use the output of the conditional GAN without Core-Sp , to which we include the post-processing method by Deudon et al. (2018). 
The post-processing method uses a mask vector to enforce that the model can only visit the locations in the daily requested set, and removes all the duplicates in the output schedule. In Figure 13 (left), we compare the performance of Core-Sp using exact MDDs against the baseline. To compile the exact MDD, we use the incremental iterative algorithm mentioned above. The figure shows that the conditional GAN (cGAN) can only generate around $0.1\\\\%$ of valid routes, mostly due to visiting some locations more than once. Once we apply the post-processing method to the output generated by the baseline (cGAN $^+$ post process), the model’s performance is improved to 50% for the data set $T\\\\,=\\\\,2$ . However, the post-processing method cannot handle the combinatorial complexity of the dispatching problem, as its performance quickly falls close to the baseline cGAN as the number of locations in the daily requested set increases. In contrast, the percentage of valid routes is always $100\\\\%$ using the exact Core-Sp . The percentage of valid routes using relaxed MDDs (width $\\\\leq128$ ) is shown in Figure 13 (right). We can see that Core-Sp still produces over $80\\\\%$ valid routes.  \\n\\nRoute Reward Comparison. To evaluate the neural model’s capability of learning the implicit preferences, we compare the reward function value of the routes generated from the structured prediction algorithms and the ground-truth routes. Notice that the ground truth routes are those that maximize the reward function $R(\\\\cdot)$ , which is defined in Equation (10). We define the normalized reward in the following way:  \\n\\n$$\\n\\\\mathrm{norm-reward}=\\\\frac{1}{N}\\\\sum_{i=1}^{N}\\\\frac{R(\\\\tilde{y})}{R(y)},\\n$$  \\n\\n  \\nFigure 12: (Left) The memory usage of relaxed MDDs. (Right) The percentage of valid routes produced by Core-Sp using relaxed MDDs.  
\\n\\n  \\nFigure 13: Our exact Core-Sp models outputs $100\\\\%$ valid routes in the vehicle service dispatching task, while competing approaches, namely conditional GAN (cGAN) and cGAN with post-processing cannot guarantee valid routes. Experiments are carried out with varying maximum numbers of locations in the daily requests. (Left) Exact MDDs are created by the incremental iterative algorithm described in the main text. (Right) Relaxed MDDs are generated with maximum width $2^{20}$ to ensure that the memory consumption is less than 1 GB.  \\n\\nwhere routes $\\\\tilde{y}$ are predicted from the machine learning algorithms and $y$ are the groundtruth routes that maximize the hidden reward function. According to our definition, norm-reward cannot be greater than 1. The closer norm-reward is to 1, the better the generated routes satisfy the hidden preferences of the driver. When computing this metric, we only include valid routes, and we let $N$ be the number of valid routes generated by the algorithm in the test data.  \\n\\nFigure 14 demonstrates the normalized rewards (defined in Equation 11) of the valid routes generated by Core-Sp and the baseline cGAN with post-processing. We can see that both methods can generate routes with a normalized reward score between 0 .6 and 1. Observe that the routes generated by our Core-Sp module have more stable normalized rewards than the cGAN model with post-processing.  \\n\\n  \\nFigure 14: Comparing the normalized reward value of the model for the exact Core-Sp method and cGAN with post-processing, for the vehicle service dispatching application. The hidden driver preferences are reflected by the normalized reward (defined in Equation 11). 
Core-Sp and cGAN with post-processing both achieve good normalized rewards.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633686947292, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 13, 'chunk_text': '# 6.2 If-then Program Synthesis\\nDatasets and Metrics. The data sets for this experiment are crawled from the IFTTT and Zapier websites. $^{5,6}$ The statistics of the two data sets are shown in Table 1. The IFTTT data set contains more data samples than the Zapier data set, while the dimensions of the four labels in the Zapier data set are several times larger than those of the IFTTT data set. The sentences in the data set are tokenized by the Spacy library.  \\n\\nTo evaluate the performance of different models on this data set, we consider two metrics: the percentage of valid if-then programs, and accuracy. A program is considered valid if it satisfies our defined Functionality constraints. The accuracy metric is the percentage of predicted programs that match exactly in all four fields with those in the test set. This metric shows the percentage of correctly predicted programs.  \\n\\n<html><body><table><tr><td>Dataset</td><td>#train set</td><td>#val set</td><td>#test s set</td><td>#quadruple</td><td>#vocabulary</td></tr><tr><td>IFTTT</td><td>66761</td><td>4148</td><td>2640</td><td>(111, 443, 88, 161)</td><td>4000</td></tr><tr><td>Zapier</td><td>24454</td><td>4809</td><td>2576</td><td>(1353, 1755, 1333, 1466)</td><td>3782</td></tr></table></body></html>  \\n\\n  \\nTable 1: The statistics for the IFTTT and Zapier data sets.   \\nFigure 15: Percentage of valid programs (left column) and MDD memory consumption (right column) on IFTTT and Zapier data sets. Core-Sp outperforms the state-of-the-art approach LatentAttention (Liu et al., 2016) in generating valid if-then programs. 
The percentages of valid programs generated by Core-Sp embedding MDDs with different widths are shown for the IFTTT (top left) and Zapier (bottom left) data sets. Core-Sp model that embeds the exact MDD produces $100\\\\%$ valid programs on the two data sets. The relaxed and exact MDD for the IFTTT data set takes less than 4 MB and for the Zaiper data set takes less than $20\\\\ \\\\mathrm{MB}$ memory space.  \\n\\nValid Programs Comparison. Core-Sp significantly boosts the percentage of valid programs generated. In this experiment, we start with evaluating the percentage of valid programs generated from the state-of-the-art LatentAttention model without the CoreSp module. Then we apply the Core-Sp module from Algorithm 1, which iteratively increases the width of the relaxed MDD until we arrive at the exact MDD. Figure 15 shows the performance of all the relaxed and the exact Core-Sp modules when added to the LatentAttention model. Among all programs produced by the LatentAttention model without the Core-Sp layer, around $88\\\\%$ of them are valid on the two data sets. Once we enforce the exact Core-Sp capturing the Functionality constraint, all the programs (100%) produced are valid. We also study the effect of restricting the maximum layer width of the MDDs used in Algorithm 1. We evaluate Core-Sp with MDDs of width-2 up to the largest width, which is width-111 for IFTTT and width-1353 for Zapier. The percentage of valid programs on a separate testing set is shown in the blue lines. The performance of the relaxed Core-Sp increases gradually with the increase of the MDD width.  \\n\\n  \\nFigure 16: The Core-Sp module (red lin brings approximately $1\\\\mathrm{-}2\\\\%$ increase in accuracy for the IFTTT data set and 2% increase for the Zapier data set for the if-then program synthesis task. The LatentAttention model (blue line) is the previous state-of-the-art, which cannot guarantee the validity of the programs generated.  \\n\\nAccuracy Comparison. 
Figure 16 compares the training set and testing set accuracy for the state-of-the-art LatentAttention model and Core-Sp as the training progresses.  \\n\\n<html><body><table><tr><td></td><td colspan=\"3\">IFTTT</td><td colspan=\"3\">Zapier</td></tr><tr><td>Methods</td><td>Width</td><td>Accuracy</td><td>Valid (%)</td><td>Width</td><td>Accuracy</td><td>Valid 1 (%)</td></tr><tr><td>LatentAttention</td><td>N/A</td><td>42.17%</td><td>87.51%</td><td>N/A</td><td>31.74%</td><td>88.00%</td></tr><tr><td>BestrelaxedCoRE-SP</td><td>80</td><td>44.12%</td><td>99.19%</td><td>1200</td><td>34.28%</td><td>99.53%</td></tr><tr><td>Exact CORE-SP</td><td>111</td><td>43.07%</td><td>100%</td><td>1353</td><td>32.83%</td><td>100%</td></tr></table></body></html>  \\n\\nTable 2: The relaxed and exact Core-Sp modules boost the percentage of valid programs generated and the accuracy for the if-then program synthesis task on both the IFTTT and the Zapier data sets. Exact Core-Sp produces $100\\\\%$ valid programs while Core-Sp with the best relaxed MDD produced by Algorithm 1 leads to the best accuracy in the prediction and close to $100\\\\%$ valid programs.  \\n\\nWe also collect the results of the LatentAttention model without Core-Sp , the model with the best relaxed Core-Sp model (in terms of accuracy) and with the exact Core-Sp model on the two data sets in Table 2. The best relaxed Core-Sp model ves $1-2\\\\%$ higher accuracy than the LatentAttention model and still generates around 11% more valid programs than the LatentAttention model. Similarly, the model with the exact Core-Sp module improves approximately $1\\\\%$ in accuracy but generates $100\\\\%$ valid programs.\\n\\n# 6.3 SQL Query Generation from Natural Language\\nDataset and Metrics. We conduct experiments on the large-scale WikiSQL data set (Zhong et al., 2017), which contains 80 ,654 examples of questions and SQL queries distributed across 24 ,241 tables from Wikipedia. 
We observe that most of the SQL queries are not complex. Therefore, we further select queries within the data set to form a moderate and a hard test set. The moderate test set consists of those queries containing at least one conditional statement ( i.e. , “ cond-col cond-op cond-val ”). The hard test set is composed of those queries that have at least two conditional statements.  \\n\\nThe metrics applied for this task are: 1) Percentage of valid SQL queries, i.e. , generated queries that satisfy the datatype constraint. 2) Execution accuracy. A generated query is considered correct if the returned value of executing the generated SQL query matches the returned value from the ground truth query. 3) Logical accuracy, which evaluates the percentage of the generated queries that match exactly the ground truth queries in every field. The implementation is based on SQLNova. We use the BERT-base model (Devlin et al., 2019) as the word embedding. The entire model takes up to 3 days to train for 50 epochs. We choose the model that achieves the best execution accuracy on the validation data set for both the baseline and Core-Sp and calculate the corresponding statistics reflected in Table 3.  \\n\\nValid SQL Queries Comparison. As shown in Table 3, SQLNova with the Core-Sp module embedded generates $100\\\\%$ valid SQL programs, demonstrating 0.7% improvement over the original SQLNova model on the full testing set. On the moderate testing set, the improvement increases to $5.7\\\\%$ . On the most difficult hard testing set, the improvement becomes $16.3\\\\%$ . 
Due to the fact that a majority of the SQL queries in the full test set have empty value at cond-op and $=$ value at sel-op , SQLNova has a high probability to predict  \\n\\nJiang, Zhang, Hoeve and Xue  \\n\\n<html><body><table><tr><td>Accuracy</td><td colspan=\"2\">Full test set</td><td colspan=\"2\">Moderate test set</td><td colspan=\"2\">Hard test set</td></tr><tr><td>per component</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td></tr><tr><td>sel-col</td><td>96.3%</td><td>96.3%</td><td>96.4%</td><td>97.0%</td><td>96.6%</td><td>97.7%</td></tr><tr><td>agg-op</td><td>89.8%</td><td>89.7%</td><td>75.7%</td><td>77.8%</td><td>75.4%</td><td>75.8%</td></tr><tr><td>#WHERE</td><td>98.1%</td><td>97.9%</td><td>98.5%</td><td>98.6%</td><td>98.9%</td><td>98.5%</td></tr><tr><td>cond-col</td><td>93.6%</td><td>93.6%</td><td>94.0%</td><td>93.8%</td><td>93.6%</td><td>93.7%</td></tr><tr><td>cond-op</td><td>96.7%</td><td>96.9%</td><td>89.8%</td><td>91.6%</td><td>84.8%</td><td>87.9%</td></tr><tr><td>where-val-idx</td><td>94.5%</td><td>94.8%</td><td>89.4%</td><td>92.3%</td><td>86.7%</td><td>87.5%</td></tr><tr><td>where-val</td><td>94.7%</td><td>94.9%</td><td>89.3%</td><td>92.2%</td><td>86.4$</td><td>87.1%</td></tr><tr><td rowspan=\"2\">Overall Accuracy</td><td colspan=\"2\">Full test set</td><td colspan=\"2\">Moderate test set</td><td colspan=\"2\">Hard test set</td></tr><tr><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td></tr><tr><td>Logical Accuracy</td><td>79.3%</td><td>79.9%</td><td>61.6%</td><td>65.8%</td><td>58.3%</td><td>62.5%</td></tr><tr><td>Execution Accuracy</td><td>85.5%</td><td>86.1%</td><td>75.4%</td><td>79.1%</td><td>76.1%</td><td>78.0%</td></tr><tr><td>Valid SQL</td><td>99.3%</td><td>100.0%</td><td>94.3%</td><td>100%</td><td>83.7%</td><td>100%</td></tr></table></body></html>  \\n\\nTable 3: Core-Sp outperforms the previous state-of-the-art SQLNova on three testing sets in 
SQL query generation. Core-Sp leads to $100\\\\%$ valid SQL queries generated and increases in both the execution accuracy and the logical accuracy compared with SQLNova for the Text2SQL generation task. The top table shows the accuracy of predicting each field in the SQL queries for both models.  \\n\\nprevalent labels in the data set and coincidentally satisfies the SQL grammar. This is the main reason that our relative improvement is not significant for the full test set.  \\n\\nExecution and Logical Accuracy. Figure 17 compares SQLNova and the exact CoreSp model over execution and logical accuracy metrics as the training progresses. We also collect the accuracy of predicting each field in the SQL queries as shown in the table (top) of Table 3. The execution and logical accuracy are shown at the bottom of Table 3. CoreSp gains improvement for predicting sel-col ,cond-op ,where-val-idx and where-val components. For the other components in the SQL queries, the difference in accuracy between Core-Sp and SQLNova is less than $0.4\\\\%$ . In terms of the execution accuracy, the exact Core-Sp is higher than SQLNova by $0.6\\\\%$ ,$3.7\\\\%$ and $1.9\\\\%$ on the full, moderate, and hard test sets, respectively. In terms of the logical accuracy, the exact Core-Sp is higher than SQLNova by 0 .6%, 4 .2%, and $4.2\\\\%$ for the three testing sets. The improvement in the execution and logical accuracy is due to the fact that the Core-Sp module removes invalid operators during SQL generation and as a consequence reduces the modeling space.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633719453150, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 14, 'chunk_text': '# 7. 
Conclusion\\nIn this work, we proposed Core-Sp , an end-to-end neural module that embeds constraint reasoning into machine learning for structured prediction problems. Core-Sp represents the constraints using decision diagrams and filters out invalid solutions. Core-Sp is then embedded into a neural network which can be trained in an end-to-end fashion. We demonstrate the effectiveness of Core-Sp on three structured prediction applications including vehicle dispatching service planning, if-then program synthesis, and Text2SQL generation. We also propose an iterative search algorithm to find the optimal decision diagram structure for these applications. We show that the Core-Sp module improves constraint satisfaction in all three applications. In addition, Core-Sp reduces the modeling space. As a consequence, neural networks with Core-Sp embedded learn faster and generalize better than the pure neural network models. For future work, we plan to generalize Core-Sp in continuous domains and in reinforcement learning.  \\n\\n  \\nFigure 17: The execution accuracy (left) and logical accuracy (right) over training iterations for both Core-Sp and SQLNova. Core-Sp leads to higher execution and logical accuracy throughout the training iterations.\\n\\n# Acknowledgments\\nWe thank all the reviewers for their constructive comments. This research was supported by NSF grants IIS-1850243, CCF-1918327, CCF-1918102, and Office of Naval Research Grant No. N00014-21-1-2240. M. Z. completed this work when he was a master’s student at Purdue University.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}]\n"
     ]
    }
   ],
   "source": [
    "# Exercise the /query_by_title endpoint with an exact paper title.\n",
    "print(\"\\nTesting /query_by_title API...\")\n",
    "call_api(\"query_by_title\", params={\"title\": \"Constraint Reasoning Embedded Structured Prediction.\"})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bf3e93b9-b77f-407f-a9f5-ee9a8e70f9e3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Testing /query_by_title_contain API...\n",
      "[SUCCESS] query_by_title_contain - Response:\n",
      "[{'id': 454845711343705550, 'paper_id': '6544571e939a5f4082e793a9', 'paper_title': 'CRUSH4SQL: Collective Retrieval Using Schema Hallucination for Text2SQL', 'chunk_id': 0, 'chunk_text': '# CRUSH4SQL: Collective Retrieval Using Schema Hallucination For Text2SQL\\nMayank Kothyari ∗and Dhruva Dhingra and Sunita Sarawagi ∗and Soumen Chakrabarti Department of Computer Science and Engineering Indian Institute of Technology Bombay, Mumbai, India\\n\\n# Abstract\\nExisting Text-to-SQL generators require the entire schema to be encoded with the user text. This is expensive or impractical for large databases with tens of thousands of columns. Standard dense retrieval techniques are inadequate for schema subsetting of a large structured database, where the correct semantics of retrieval demands that we rank sets of schema elements rather than individual elements. In response, we propose a two-stage process for effective coverage during retrieval. First, we instruct an LLM to hallucinate a minimal DB schema deemed adequate to answer the query. We use the hallucinated schema to retrieve a subset of the actual schema, by composing the results from multiple dense retrievals. Remarkably, hallucination — generally considered a nuisance — turns out to be actually useful as a bridging mechanism. Since no existing benchmarks exist for schema subsetting on large databases, we introduce three benchmarks. Two semi-synthetic datasets are derived from the union of schemas in two wellknown datasets, SPIDER and BIRD, resulting in 4502 and 798 schema elements respectively. A real-life benchmark called SocialDB is sourced from an actual large data warehouse comprising 17844 schema elements. We show that our method 1 leads to significantly higher recall than SOTA retrieval-based augmentation methods.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}]\n"
     ]
    }
   ],
   "source": [
    "# Exercise the /query_by_title_contain endpoint (params: title, top_k).\n",
    "print(\"\\nTesting /query_by_title_contain API...\")\n",
    "call_api(\"query_by_title_contain\", params={\"title\": \"Text2SQL\", \"top_k\": 20})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "acd98e6c-f514-4ce4-bf00-90534ba13b29",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Testing /query_by_chunk_contain API...\n",
      "[SUCCESS] query_by_chunk_contain - Response:\n",
      "[{'id': 454845679145865800, 'paper_id': '6535d747939a5f408295c476', 'paper_title': 'NameGuess: Column Name Expansion for Tabular Data', 'chunk_id': 1, 'chunk_text': '# 1 Introduction\\nTabular data is widely used for storing and organizing information in web ( Zhang and Balog ,2020 )and enterprise applications ( Leonard ,2011 ). One common practice when creating tables in databases is to use abbreviations for column headers due to character length limits in many standard database systems. For example, the maximum length for column names in an SQL database is 256 bytes, leading to the use of abbreviations such as \" D_ID \"for “Department ID” and \" E_NAME \" for “Employee Name” as in Figure 1 . While abbreviations can be convenient for representation and use in code, they can cause confusion, especially for those unfamiliar with the particular tables or subject matter. Column headers are essential for many table-related tasks ( Xie et al. ,2022 ), and using abbreviations makes it challenging for end users to search and retrieve relevant data for their tasks.  \\n\\n<html><body><table><tr><td>Output Logical Name</td><td>Department ID</td><td>Department Name</td><td>Employee Name</td><td>Salary</td><td>Commission</td></tr><tr><td colspan=\"6\"></td></tr><tr><td rowspan=\"4\">Input Query Column Name</td><td>D_ID</td><td>D NAME</td><td>NAME</td><td>SAL</td><td>COMM</td></tr><tr><td>10</td><td>Accounting</td><td>Allen</td><td>14000</td><td>.4</td></tr><tr><td>10</td><td>Accounting</td><td>Ward</td><td>13500</td><td>.3</td></tr><tr><td>30</td><td>Research</td><td>Martin</td><td>12000</td><td>.3</td></tr><tr><td></td><td>20</td><td>Sales</td><td>Turner</td><td>11000</td><td>.3</td></tr><tr><td></td><td>30</td><td>Research</td><td>Smith</td><td>10500</td><td>2</td></tr></table></body></html>\\n\\nTable name: Employee_Salary_2022  \\n\\nAbbreviated column names can negatively impact the usefulness of the underlying data. 
For example, in the text2SQL semantic parsing task, which converts natural language into formal programs or queries for retrieval, abbreviations can lead to a mismatch with the terms used in the natural language queries. In fact, in the human-labeled text2SQL spider dataset ( Yu et al. ,2018 ), $6.6\\\\%$ of column names are abbreviations. Figure 2 shows an example containing abbreviated column names like \" c_name \" and \" acc_bal \" in tables, which mismatch the terms “the name of all customers” and “account balance”. Simple changes in using abbreviated column names in the spider dataset result in a performance degradation of over ten percentage points, with the exact match score of $66.63\\\\%$ (Xie et al. ,2022 ) dropping to $56.09\\\\%$ on the T5-large model ( Raffel et al. ,2020 ). The effect of the abbreviated column names on table question answering (QA) ( Yin et al. ,2020 ), and column relation discovery ( Koutras et al. ,2021 ) are in Table 1 and description is in Appendix A.1 . The performance degradation emphasizes the need for descriptive column headers in handling tabular data.  \\n\\n<html><body><table><tr><td>Table1: bank</td><td>branch_ID</td><td>bname</td><td>-jo-ou customers</td><td>city</td><td>state</td><td></td></tr><tr><td>Table2: customer</td><td>customer ID</td><td>c_ name</td><td>account _type</td><td>acc_ bal</td><td>no_of_ loans</td><td>credit score</td></tr></table></body></html>  \\n\\nExpanding column names and generating descriptive headers also has other beneficial aspects. First, using expanded column names can increase the readability of tables, especially when complex or technical data is present. The expansion also enables data integration by allowing users to easily distinguish between tables with similar column names but different meanings and helping identify relationships between tables with different abbreviated column names. 
Finally, expanded column names can also improve the efficacy of keywordbased searches for discovering related tables.  \\n\\nThis work addresses the task of expanding abbreviated column names in tabular data. To the best of our knowledge, this is the first work to introduce and tackle this problem. Unlike previous textual abbreviation expansion works that formulated the task as a classification problem with a predefined set of candidate expansions ( Roark and Sproat ,2014 ;Gorman et al. ,2021 ), we formulate NAME GUESS as a natural language generation problem. Acquiring extensive candidate expansions can be laborious, as pairs of abbreviated and expanded column names are seldom present in the same table. Conversely, abbreviation-expansion pairs can be gleaned from textual data through cooccurrence signals, such as parenthetical expressions. Moreover, abbreviated headers may exhibit ambiguity and polysemy arising from developerspecific naming conventions and domain-related variations in expansions.  \\n\\nTo tackle NAME GUESS , we first built a large dataset consisting of 163,474 tables with 384,333 column pairs and a human-annotated benchmark with 9,218 column pairs on 895 tables. We then proposed a method to produce training data by selectively abbreviating well-curated column names from web tables using abbreviation look-ups and probabilistic rules. Next, we enhanced autoregressive language models with supervised finetuning, conditioned on table content and column headers, and conducted extensive experiments to evaluate state-of-the-art LLMs. The overall model performance is shown in Figure 3 . While GPT-4 exhibited promising performance on NAME GUESS ,the deployment of such LLMs comes with much larger memory and computation overheads. Our findings indicate that supervised fine-tuning of smaller 2.7B parameter models achieves close to human performance, and including table contents consistently improved performance. 
However, all models found the task challenging, with the best only achieving $54.7\\\\%$ accuracy on the extra-hard examples, indicating room for improvement in expanding abbreviated column names. Our main contributions are:  \\n\\n  \\nFigure 2: An example for Text2SQL semantic parsing. The terms “the name of all customers” and “account balance” do not match the abbreviated column names c_name and acc_bal . Instead, they match with the column names customer_ID and account_type .  \\nFigure 3: Exact match results for fine-tuned models $(^{*})$ ,non-finetuned LLMs, and human performance. Solid and hollow symbols denote inclusion and exclusion of sampled table contents.  \\n\\n1. Introduced a new column name expansion task, named NAME GUESS , as a natural language generation problem,   \\n2. Developed a large-scale training dataset for the NAME GUESS task using an automatic method that largely reduces human effort,   \\n3. Created a human-annotated evaluation benchmark with various difficulty levels, which provides a standard for comparing results,   \\n4. Performed a comprehensive evaluation of LMs of different sizes and training strategies and compared them to human performance on the NAME GUESS task.  \\n\\nTable 1: The effect of abbreviated column names on three table understanding tasks. The performance drops on all the tasks.   
\\n\\n\\n<html><body><table><tr><td></td><td>Original ColumnNames</td><td>Abbreviated ColumnNames</td></tr><tr><td>Text2SQL (Match score %)</td><td>66.63</td><td>56.09</td></tr><tr><td>Schema-based RelationDetection (Recall %)</td><td>100.00</td><td>59.50</td></tr><tr><td>TableQA (Accuracy %</td><td>84.32</td><td>80.49</td></tr></table></body></html>', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}, {'id': 454845711343705550, 'paper_id': '6544571e939a5f4082e793a9', 'paper_title': 'CRUSH4SQL: Collective Retrieval Using Schema Hallucination for Text2SQL', 'chunk_id': 0, 'chunk_text': '# CRUSH4SQL: Collective Retrieval Using Schema Hallucination For Text2SQL\\nMayank Kothyari ∗and Dhruva Dhingra and Sunita Sarawagi ∗and Soumen Chakrabarti Department of Computer Science and Engineering Indian Institute of Technology Bombay, Mumbai, India\\n\\n# Abstract\\nExisting Text-to-SQL generators require the entire schema to be encoded with the user text. This is expensive or impractical for large databases with tens of thousands of columns. Standard dense retrieval techniques are inadequate for schema subsetting of a large structured database, where the correct semantics of retrieval demands that we rank sets of schema elements rather than individual elements. In response, we propose a two-stage process for effective coverage during retrieval. First, we instruct an LLM to hallucinate a minimal DB schema deemed adequate to answer the query. We use the hallucinated schema to retrieve a subset of the actual schema, by composing the results from multiple dense retrievals. Remarkably, hallucination — generally considered a nuisance — turns out to be actually useful as a bridging mechanism. Since no existing benchmarks exist for schema subsetting on large databases, we introduce three benchmarks. 
Two semi-synthetic datasets are derived from the union of schemas in two wellknown datasets, SPIDER and BIRD, resulting in 4502 and 798 schema elements respectively. A real-life benchmark called SocialDB is sourced from an actual large data warehouse comprising 17844 schema elements. We show that our method 1 leads to significantly higher recall than SOTA retrieval-based augmentation methods.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}, {'id': 454845711493127644, 'paper_id': '6544571e939a5f4082e793a9', 'paper_title': 'CRUSH4SQL: Collective Retrieval Using Schema Hallucination for Text2SQL', 'chunk_id': 7, 'chunk_text': '# 5.5 Ablation on Collective Retriever\\nCRUSH includes a number of careful design choices. In Table 9 we show the impact of each design choice.  \\n\\n•During retrieval, we obtain the embeddings of a $k\\\\in\\\\kappa$ jointly ith $x$ . In contrast, if we independently embed k, the recall drops significantly. •After retrieval, the overall objective of collective selection (Eq 6 ) incorporates three key ideas: entropy guided similarity, edge scores, and coverage of hallucinated schema elements. We study the impact of each. We remove the entropy discounting in Eqn. (2 ), and observe a drop in recall at low budget levels. When we remove the edge scores, we also see a mild drop. •To study the impact of coverage, we replace the soft-max function $\\\\operatorname{smx}()$ with a simple summation so that for each selected $d\\\\in\\\\mathcal{D}$ reward is just the sum of similarity to each that the recall suffers. A coverage encouraging ∈K . We find objective is important to make sure that the seTable 9: Ablation on design choices of CRUSH on the SpiderUnion dataset. Each row after the first, provides CRUSH with one of the key design elements of CRUSH removed.  \\n\\nTable 8: Effect of temperature changes on recall.   
\\n\\n\\n<html><body><table><tr><td></td><td colspan=\"3\">Budget r@3r@5r@10r@20r@30</td></tr><tr><td colspan=\"4\">SpiderUnion</td></tr><tr><td>Recall at temp = 0 Recall at temp = 0.5</td><td>0.59 0.72 0.58 0.70</td><td>0.83 0.90 0.82 0.89</td><td>0.92 0.92 0.91</td></tr><tr><td colspan=\"4\">Recall at temp = 1 0.58 0.69 0.82 0.89</td></tr><tr><td>Recall at temp = O Recall at temp = 0.5 Recall at temp = 1</td><td>SocialDB 0.40 0.52 0.41 0.50 0.36 0.47 0.56 0.63</td><td>0.58 0.69 0.61 0.67</td><td>0.71 0.71 0.70</td></tr></table></body></html>  \\n\\n<html><body><table><tr><td></td><td colspan=\"4\">Budget</td></tr><tr><td>CRUSH</td><td>0.59 0.72</td><td>0.83</td><td>0.90</td><td>0.92</td></tr><tr><td>-c-contextualembedding</td><td>0.53 0.66</td><td></td><td>0.86</td><td>0.90</td></tr><tr><td>一</td><td>0.54</td><td>0.77</td><td></td><td>0.91</td></tr><tr><td>Entropy</td><td>0.67</td><td>0.81</td><td>0.89</td><td></td></tr><tr><td>-Edgescores</td><td>0.57 0.71</td><td>0.83</td><td>0.90</td><td>0.92</td></tr><tr><td>一 -Coverage</td><td>0.54 0.67</td><td>0.81</td><td>0.89</td><td>0.91</td></tr></table></body></html>  \\n\\nlected items are not over-represented by matches to a few $k\\\\in\\\\kappa$ .\\n\\n# 6 Conclusion\\nWhile LLMs incorporate vast world knowledge and corpus statistics, they may be unfamiliar with (possibly private) client DB schemas, which can be very large, rendering impractical or expensive any attempt to upload the full schema in-context along with questions for Text-to-SQL applications. Remarkably, we find a workable middle ground by allowing the LLM to hallucinate a schema from the question and limited in-context examples with no reference to the client schema. Then we formulate a novel collective optimization to map the hallucinated schema to real DB schema elements. The resulting real schema subset that is retrieved has a small size, yet high recall of the fold subset in our experiments involving two new datasets that we also contribute. 
This schema subset can be readily uploaded to (L)LM-based Text-to-SQL methods. The reduced space of client DB schema elements also improves the accuracy of generated SQL for state-of-the-art Text-to-SQL implementations.\\n\\n# 7 Limitations\\nRemoving two limitations in CRUSH may be useful in future work. First, at present, hallucination is almost completely unguided by the client DB schema. It would be of interest to explore if the client DB schema can be compressed into a very small prompt text to give some limited guidance to the LLM schema hallucinator. At present the link weights $e(d,d^{\\\\prime})$ between schema elements $d,d^{\\\\prime}$ are hardwired; it may be useful to extend the learning optimization to fit these weights.\\n\\n# 8 Acknowledgement\\nWe thank Microsoft for sponsoring access to Azure OpenAI API via the Accelerate Foundation Models Academic Research Initiative. A special thanks to Niti Aayog’s NDAP team for letting us use their data schema and sharing their query workload. Additionally, we extend our thanks to Mayur Datar and Vinayak Borkar for stimulating discussions. We thank IBM’s AI Horizons grant for partly supporting this research. Soumen Chakrabarti was partly supported by grants from IBM and SERB.\\n\\n\\n\\n# CRUSH4SQL: Collective Retrieval Using Schema Hallucination For Text2SQL (Appendix)\\n\\n# A Anecdotes\\nIn Table 10 we show examples of schema retrieved by baseline single embedding method and CRUSH. Observe how retrieved set from single embedding is biased towards matching one of the columns of the hallucinated schema.\\n\\n# BPrompts for SocialDB and BirdUnion dataset\\nIn Table 11 , and Table 12 we show the six few-shot examples used in prompts for schema hallucination for SocialDBand BirdUnion dataset, respectively. The bottom half of the table shows four hallucinated schema obtained from the LLM.  \\n\\nTable 10: Results from single embedding retrieval with OpenAI Vs CRUSH.   
\\n\\n\\n<html><body><table><tr><td>C</td><td>What are the ids of students who both have friends and are liked?</td></tr><tr><td>R(q(x))</td><td>network.friend.student_id, network.likes.liked_id</td></tr><tr><td>R(x)</td><td>Single Embedding (OpenAI) college.student.id, student_assessment.students.student_id,</td></tr><tr><td></td><td>school_player.school.school_id, school_player.school_performance.school_id, student_assessment.student_course_attendance.student_id, student_assessment.candidate_assessments.candidate_id, student_assessment.candidates.candidate_id, voter.student.stuid network_1.likes.student_id,</td></tr><tr><td>R(c)</td><td>school_player.school.boys_or_girls CRUSH</td></tr><tr><td></td><td>student_assessment.students.student_id network_1.likes.student_id e_learning.students.student_id network_1.friend.friend_id network_1.friend.student_id network_1.likes.liked_id student_assessment.students.student_details student_assessment.student_course_attendance.student_id student_assessment.student_course_registrations.student_id network_2.personfriend.friend]</td></tr></table></body></html>  \\n\\n<html><body><table><tr><td>#</td><td>LLM prompt: Hallucinate a minimal schema of a relational database that can be used to answer the natural language</td></tr><tr><td>c K</td><td>question. Here are some examples: What is the correlation between child nourishment and parental education in the state of Madhya Pradesh? 
Family_health_survey(child age, child nourishment), Population_census( state, age-group, male literate population,</td></tr><tr><td></td><td>female literate population) Health center per population ratio at the village level or district level from the year 2015?</td></tr><tr><td>K C</td><td>Health_infrastructure(village, health care facility), Population_census(district, male population, female population) Distribution of medical professionals by type across regions from 2011 onwards from the state of Kerala.</td></tr><tr><td>K</td><td>Health_statistics_statewise(medical professional) Correlation between road connectivity and Mother Mortality Rate (MMR) during 2011 from the state UK.</td></tr><tr><td>C K</td><td>Family_health_survey(state, year, maternal mortality), Road_statistics(state, road type)</td></tr><tr><td>C</td><td>What is the trend for CPI of goods excluding food and fuel?</td></tr><tr><td>K</td><td>Inflation_money_and_credit(year, Categories of Consumer Expenditure)</td></tr><tr><td>C</td><td>Correlation between number of bank branches and district growth?</td></tr><tr><td></td><td>Town_amenities_census(amenities, public works department), bank_details(number of branches, bank type)</td></tr><tr><td></td><td>Hallucinated K generated by LLM given input c Which Central Public Sector Enterprise generated most employment in the 10 years?</td></tr><tr><td>K</td><td></td></tr><tr><td>C</td><td></td></tr><tr><td></td><td>health_statistics_statewise(state, year, caesarean section births), women_awareness_survey(state, year, awareness level)</td></tr><tr><td>C K</td><td>what is the correlation between socio-economic status and health insurance enrollments?</td></tr><tr><td>C</td><td>socio_economic_status(income, education level), health_insurance_enrollment(age, gender, income level)</td></tr><tr><td></td><td>per 2009-10 over a period of 5 years?</td></tr><tr><td></td><td>export_data(year, commodity, country, total export volume), 
commodity_classification(commodity, new commodity classification)</td></tr></table></body></html>  \\n\\nTable 11: Examples of in-context training examples (for SocialDB) given to the LLM to prompt it to hallucinate a minimal schema of a database that can be used to answer the given question.   \\n\\n\\n<html><body><table><tr><td>#</td><td>LLM prompt: Hallucinate a minimal schema of a relational database that can be used to answer the natural language</td></tr><tr><td></td><td>question. Here are some examples: What is the brand of the truck that is used to ship by Zachery Hicks?</td></tr><tr><td>c K</td><td>truck(truck_id, make), shipment(truck_id, driver_id), driver(driver_id, first_name, last_name)</td></tr><tr><td>c</td><td>State the name of the citywhereJoseRodriguezworks.</td></tr><tr><td></td><td>employee(locationID, firstname, lastname), location(locationID, locationcity)</td></tr><tr><td>C</td><td>Please list all horror films that have a rating of 1.</td></tr><tr><td>K</td><td>u2base(movieid, rating), movies2directors(movieid, genre)</td></tr><tr><td>c</td><td>List all the names of the books written by Danielle Steel.</td></tr><tr><td>K</td><td>book(book_id, title), book_author(book_id, author_id), author(author_id, author_name)</td></tr><tr><td>C</td><td></td></tr><tr><td>K</td><td>How many female representatives are there in Michigan?</td></tr><tr><td>C</td><td>current(bioguide_id, bioguide,gender_bio),current_terms(bioguide, type, state) How many stars does each of the 3 top users with the most likes in their reviews have?</td></tr><tr><td></td><td>Tips(user_id, likes), Users(user_id, user_average_stars)</td></tr><tr><td colspan=\"2\">Hallucinated K generated by LLM giveninput r</td></tr><tr><td>C</td><td>Which country had the gas station that sold the most expensive product id No.2 for one unit?</td></tr><tr><td>K</td><td></td></tr><tr><td></td><td>Please list the titles of theposts owned by the user 
csgillespie?</td></tr><tr><td>K</td><td>Posts(post_id, title, user_id), Users(user_id, username)</td></tr><tr><td>C K</td><td>Which country is the constructor which got 1 point in the race No. 24 from?</td></tr><tr><td>C</td><td>race(race_id, constructor_id), points(race_id, constructor_id, points), constructor(constructor_id, country) What is the administrator\\'s email address for the school with the highest number of test takers who received SAT scores</td></tr><tr><td></td><td>of atleast1500?Providethenameof theschool.</td></tr><tr><td></td><td>Schools(school_id, school_name, administrator_email), Test_takers(school_id, SAT_score)</td></tr></table></body></html>\\n\\nTable 12: Examples of in-context training examples (for BirdUnion) given to the LLM to prompt it to hallucinate a minimal schema of a database that can be used to answer the given question.', 'original_filename': 'Conf_Paper_Meta_Data_EMNLP_2023_with_whole_text.db'}, {'id': 454846633277740484, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 1, 'chunk_text': '# 1. Introduction\\nThe emergence of large-scale constraint reasoning and machine learning technologies have impacted virtually all application domains, including marketing, linguistics, operations, retail, robotics, and health care. Constraint reasoning has traditionally been applied to building prescriptive models that generate solutions for strategic, tactical, or operational use (Choi et al., 2012). It requires a precise problem description and is usually difficult to be made flexible to the evolving data distributions. Machine learning, on the other hand, has been applied primarily to build predictive models, such as classifications or regressions (Michalski and Anderson, 1984; Bishop, 2007). While the structure of a machine learning model (like a neural network) must be designed, the actual model parameters are learned automatically via gradient descent algorithms. 
This gives machine learning models the flexibility to adapt to the evolving data distributions. Nevertheless, it is difficult to enforce constraints on the output of machine learning models. Many real-world applications are beyond the reach of constraint reasoning or machine learning alone.  \\n\\n  \\nFigure 1: (a) Our proposed Core-Sp framework embeds constraint reasoning in machine learning for structured prediction. We demonstrate the effectiveness of Core-Sp on vehicle dispatching service, if-then program synthesis, and Text2SQL generation tasks. (b) At a high level, Core-Sp (in orange colored box) is a fully differentiable layer that simulates a path descending in the corresponding decision diagram. Core-Sp filters out the infeasible output from the structured output to ensure constraint satisfaction.  \\n\\nIn this paper, we focus on structured prediction problems, which is a class of learning problems requiring both constraint reasoning and machine learning. It expands the output space of classification problems into high-dimensional structured space. Structured prediction has diverse application domains, ranging from natural language processing (Socher et al., 2013), social network analysis (Xiang and Neville, 2013), and ecological modeling (Tang et al., 2018; Chen et al., 2018). The applications we consider in this paper all require tight integration of constraint reasoning and machine learning. Our first application vehicle dispatching service planning is to recommend a route that satisfies the daily service needs as well as meeting the drivers’ preferences. Historical data may reveal that the drivers do not follow common stylized objectives such as minimizing distance or time. Therefore standard constraint reasoning tools, e.g. , solvers for the traveling salesman problem, cannot be applied. 
While we need machine learning to capture the drivers’ objective functions, pure machine learning-based approaches are insufficient because they often generate routes that violate delivery requests. Our second and third applications are program synthesis from natural language , which clearly requires machine learning to generate structured programs. Nevertheless, a pure learning approach cannot enforce the syntactic and semantic rules of those programs.  \\n\\nWe propose Co nstraint Re asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over the structured domains. The main idea is to augment structured predictive models with a constraint reasoning module that represents physical and operational requirements. Specifically, we propose to embed decision diagrams (Akers, 1978; Bryant, 1986), a popular constraint reasoning tool, as a fully-differentiable module into deep neural networks. A decision diagram is a compact graphical representation of the constraints. It encodes each solution (an assignment of values to variables satisfying the constraints) as a path from the root to the terminal in the diagram. Core-Sp regards the neural network predictions as the simulation of descending along a path in the decision diagram. To ensure constraint satisfaction, Core-Sp filters out variable assignments from the neural network predictions that violate constraints. With the integration of Core-Sp , we provide structured prediction models with constraint satisfaction assurances. Moreover, structured prediction models with the Core-Sp layer enjoy a smaller prediction space than traditional structured prediction approaches, allowing our approach to learn faster in training and generalize better in testing. See Figure 1(a) for our proposed Core-Sp model which integrates constraint reasoning and machine learning for the three application domains. 
The high-level idea of Core-Sp is illustrated in Figure 1(b).  \\n\\nPrevious approaches have considered regularizing machine learning with constraint reasoning in various application domains. Within the broader context of learning constrained models, the work of Coletta et al. (2003); Lallouet et al. (2010); Beldiceanu and Simonis (2012); Bessiere et al. (2017); Addi et al. (2018) have studied automating the constraint acquisition process from historic data or (user-)generated queries. These approaches use partial or complete examples to identify the constraints that can be added to the model. The type of constraints that can be learned depends on the formulation. Several works (Punyakanok et al., 2004; Roth and Yih, 2005; Amos and Kolter, 2017; Ferber et al., 2020) enable learning in a constrained domain via encoding mathematical programming, such as quadratic programming or mixed integer linear programming, as a neural network layer. Deutsch et al. (2019) propose to formulate the output space as an automata. They use the constraints to prune all the invalid transitions in the automata to ensure the validity of the structured outputs. In addition, constraints imposed by a knowledge graph have been embedded into the neural network as differentiable layers (Peters et al., 2019; Wu et al., 2017). Zeng et al. (2021) and Heim (2019) enforce physical constraints or expert inputs as soft constraints. We will illustrate the difference between our approach and these methods in Section 3.2. A different approach is to embed a machine learning model into optimization, e.g. , by extending a constraint system with appropriate global constraints. For example, Lallouet and Legtchenko (2007) integrate neural networks and decision trees with constraint programming, while Lombardi et al. (2017) and Lombardi and Gualandi (2016) introduce a “Neuron” global constraint that represents a pre-trained neural network. 
Another series of approaches based on grammar variational autoencoders (Kusner et al., 2017; Dai et al., 2018; Jin et al., 2018) use neural networks to encode and decode from the parse-tree of a context-free grammar to generate discrete structures. Such approaches are used to generate chemical molecule expressions, which represent a structured domain. Machine learning approaches have also been used to solve constraint reasoning and optimization problems. This includes the works of Galassi et al. (2018) and Vinyals et al. (2015), which use neural networks to extend partial solutions to complete ones. Bello et al. (2017) handle the traveling salesman problem by framing it as reinforcement learning. Selsam et al. (2019) proposes to learn an SAT solver from single-bit supervision. Approaches based on neural Turing machines (Graves et al., 2016) employ neural networks with external memory for discrete structure generation. More recently, Khalil et al. (2017) tackle the combinatorial optimization problems in graphs, by employing neural networks to learn the heuristics in the backtrack-free search. There is also a recent trend to synthesize programs using machine learning (Guu et al., 2017; Shi et al., 2019).  \\n\\nIn experimental analysis, we demonstrate the effectiveness of Core-Sp on the following three applications: (1) Vehicle Dispatching Service Planning : a route planning problem that recommends routes to drivers to meet the service needs while satisfying the drivers’ preferences. The implicit preferences of drivers are learned from the historical traveling data. The input of this problem is the daily service requests. The output is the permutations of the service locations, representing the sequential order that the locations should be visited by the drivers. This task requires machine learning models to capture drivers’ preferences from the traveling data, and constraint reasoning to ensure the satisfaction of service requests. 
(2) If-then Program Synthesis : the task is to automatically synthesize conditional programs from the natural language. Automatic program synthesis tools are useful to streamline the program of a few online services such as IFTTT and Zapier. The if-then program is in the form of: if trigger function happens in the trigger service , then take the action function from the action service . The machine learning task, therefore, is to predict the quadruple ( trigger service ,trigger function ,action service ,action function ). This application again requires machine learning to understand the semantics of the natural language, as well as constraint reasoning to satisfy the syntactic rules of the programs. (3) Text2SQL Generation : our last application is to automatically generate SQL queries that extract information from a database to answer a question posed in natural language. The neural model is used to understand the user’s queries in natural language while the constraint reasoning tool is applied to ensure the model generates grammaticallyvalid SQL queries.  \\n\\nOur proposed Core-Sp framework demonstrates superior performance against the stateof-the-art approaches in all three applications. First, the structures generated by Core-Sp are better in constraint satisfaction. In vehicle service dispatching, all Core-Sp generated routes are valid, while a conditional generative adversarial network (cGAN) without CoreSp generates on average less than $1\\\\%$ of valid routes when handling medium-sized delivery requests. We also apply a post-processing step (Deudon et al., 2018) to boost cGAN’s performance, but it cannot handle the complexity brought by the large combinatorial space of the routing problem. Its performance quickly defaults to the case without post-processing as the number of delivery locations increases. 
For if-then program synthesis, the percentage of valid programs produced increased from 88% to 100% with the Core-Sp module incorporated into the state-of-the-art LatentAttention model (Liu et al., 2016). For Text2SQL, the percentage of valid SQL queries increased from 83 .7% to 100% with Core-Sp incorporated into the state-of-the-art SQLNova model (Hwang et al., 2019) on a hard testing set. Core-Sp also improves the learning performance of structured prediction models. We show that the routes generated by Core-Sp better fulfill drivers’ preferences than cGAN without Core-Sp . In if-then program synthesis, Core-Sp module leads to approximately $2.0\\\\%$ improvement in accuracy compared with the state-of-the-art LatentAttention model and converges to models with higher accuracy in fewer training epochs. In Text2SQL generation, the Core-Sp module improves around 4 .2% in execution accuracy and 1 .9% in logical accuracy against SQLNova on a challenging test set.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633313916358, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 2, 'chunk_text': \"# 2.1 Structured Prediction\\nStructured prediction expands the output space of classification problems into a highdimensional combinatorial space (Bakır et al., 2007). Specifically, given a set of inputoutput samples $\\\\mathcal{D}^{t r}\\\\,=\\\\,\\\\{({\\\\boldsymbol{x}}^{(i)},{\\\\boldsymbol{y}}^{(i)})\\\\}_{i=1}^{N}$ drawn i.i.d. from some unknown distribution over the space $\\\\mathcal X\\\\times\\\\mathcal Y$ , a structured pr tion model learns a conditional distribution $p_{\\\\theta}(y|x)$ ,for all $(x,y)\\\\,\\\\in\\\\,\\\\mathcal{X}\\\\times\\\\mathcal{Y}$ from data D$\\\\mathcal{D}^{t r}$ , where $\\\\theta$ denotes the parameters of the structured prediction model. 
Note that the output space ${\\\\boldsymbol{\\\\mathcal{D}}}=\\\\{0,1\\\\}^{\\\\iota}$ is a high dimensional space of combinatorial structures. The three applications we consider in this paper are all structured prediction problems. In vehicle dispatching service planning, the structured outputs are the delivery routes on a map. In if-then program synthesis, the structured outputs are the programs that complete web-service tasks. In Text2SQL generation, the structured outputs are the SQL queries that follow the SQL grammar.  \\n\\nIn the literature, various approaches have been proposed for structured prediction problems. The classifier chain approach (Read et al., 2015) decomposes the joint likelihood into a product of conditionals and reduces the structured prediction problem into a series of binary prediction problems. In this approach, the error tends to propagate along the classifier chain, which limits its effectiveness (Dembczynski et al., 2010). Energy-based modeling, such as conditional random fields (Lafferty et al., 2001; Geman and Geman, 1984) and structured prediction energy networks (Belanger and McCallum, 2016) learn to assign a high likelihood to structures that exist in the training data set while keeping the likelihood low for unseen structures. Constraints can be incorporated into these models as prior terms in the energy function but approximated inference is required to compute the intractable partition function, which often hinders their scalability. Another line of research uses structured support vector machines (Tsochantaridis et al., 2005), which apply hinge loss and row generation approaches for structured prediction; however, these were superseded in performance by later neural-network-based approaches. 
Recently, generative models, such as conditional generative adversarial networks (Mirza and Osindero, 2014; Goodfellow et al., 2014), flow models (Rezende and Mohamed, 2015), and sequence-to-sequence models (Sutskever et al., 2014) have become increasingly popular for structured prediction. These models use highly flexible neural networks to increase model capability. The over-parameterized networks with gradient descent-based optimization can learn better representation for the structures than the classic shallow models. However, it is not straightforward to enforce constraints into the neural network-based models.  \\n\\nConstraints in Structured Prediction. Often the structured output space $\\\\mathcal{V}$ is subject to additional constraints $\\\\scriptscriptstyle\\\\mathcal{C}$ . The conditional probability that $y$ takes values that violate the (physical) constraints $\\\\mathcal{C}$ given the input $x$ is zero. Such information is known prior to the training of the machine learning model. Formally, we have:  \\n\\n$$\\np(y|x)\\\\left\\\\{\\\\int>0\\\\quad{\\\\mathrm{if~}}y{\\\\mathrm{~satisfies~}}{\\\\mathcal{C}},\\\\right.\\n$$  \\n\\nTake the first task discussed in this paper as an example. A valid delivery route should cover all the requested locations and should only visit each location once. Thus, the machine learning model should assign zero probability to those invalid routes. Notice that the constraints are often intricate and the inference problem of finding a valid structure satisfying constraints cannot be decomposed into independent small problems. After learning, the inference problem is to predict the structured output $y$ given the input $x$ . Such inference problems can be solved by either Maximum A Posteriori (MAP) inference, e.g. , computing $m a x_{y}\\\\ p(y|x)$ or marginal inference, e.g. , computing $\\\\mathbb{E}_{y}[p(y|x)]$ . 
Learning structured prediction models involves solving the inference problems within the learning loop, hence having an even higher complexity.  \\n\\nCombinatorial constraints render both the inference and the learning problems highly intractable. Indeed, much effort has been made to improve the efficiency of both the inference and learning problems (Pan and Srikumar, 2018; Bello et al., 2020). For example, Niculae et al. (2018) propose the sparseMAP function which solves the inference problem by returning a few sparse structures that attain high likelihoods. This inference method sits between the MAP and marginal inference. In their problem setup, sparseMAP can be solved via quadratic programming. However, combinatorial constraints considered in this paper make the inference problem non-convex, even for a fixed structured prediction model, let alone the more challenging learning problem. Overall, constrained structured prediction presents two main challenges. The first is the sample complexity , since massive data is needed to learn an accurate model in an exponentially large space. The second is the computational complexity , since it is combinatorially intractable (unless P=NP) to generate structured outputs subject to complicated constraints.  \\n\\nSequence-to-sequence Structured Prediction. Our proposed Core-Sp method is designed to extend sequence-to-sequence models, which are recently proposed popular structured prediction models (Sutskever et al., 2014). The sequence-to-sequence model uses the re-parameterization trick to model the conditional probability $p_{\\\\theta}(y|x)$ , where $x\\\\in\\\\mathscr{X}$ denotes the input variables and $y\\\\in\\\\mathcal{V}$ is the structured output. Here $\\\\theta$ denotes the parameters of the neural model. 
Instead of modeling the probability $p_{\\\\theta}(y|x)$ directly, the model introduces an additional random variable $\\\\mathcal{Z}$ and models it as a deterministic transformation from random variable $\\\\mathcal{Z}$ and evidence $x$ to the output $y$ . In other words, the conditional probability $p_{\\\\theta}(y|x)$ is an integral over random variable $z$ in the following way:  \\n\\n$$\\n\\\\begin{array}{c}{{p_{\\\\theta}(y|x)=\\\\displaystyle\\\\int p_{\\\\theta}(y|x,z)p(z)\\\\;d z,}}\\\\\\\\ {{p_{\\\\theta}(y|x,z)=\\\\mathbb{1}\\\\{y=f_{\\\\theta}(x,z)\\\\},}}\\\\end{array}\\n$$  \\n\\nwhere we assume $\\\\mathcal{Z}$ is from a known prior probability distribution $p(z)$ . As a result, we only need to model $p_{\\\\theta}(y|x,z)$ for the overall model $p_{\\\\theta}(y|x)$ . We further assume that $p_{\\\\theta}(y|x,z)$ is given in the form of a deterministic function. We let $f_{\\\\theta}(x,z)\\\\in\\\\mathcal{D}$ be a deterministic mapping from inputs $(x,z)$ to an output in the structured space $\\\\boldsymbol{y}$ . The indicator function $\\\\mathbb{I}\\\\{\\\\cdot\\\\}$ evaluates to $1$ if and only if $y=f_{\\\\boldsymbol{\\\\theta}}(x,z)$ . This formulation is closely related to the generative adversarial network and gives us high flexibility to model multi-modal distributions. Take the vehicle dispatching service planning as an example. The input $x$ is the daily service requests and $y$ is the suggested dispatching route. There can be several routes that meet the service demands and satisfy the driver’s underlying preference function. In this case, the conditional probability $p_{\\\\theta}(y|x)$ may have multiple modes, one for each good route. This formulation allows us to represent the multi-modal distribution effectively. The variable $z$ decides which route to pick. The function $f_{\\\\boldsymbol{\\\\theta}}(x,z)$ returns one route that meets the demand of input $x$ and is randomly selected by $\\\\mathcal{Z}$ . 
If $p_{\\\\theta}(y|x)$ has $k$ modes, the space of $z$ will be split into $k$ regions where variable $z$ in every region will be mapped to one mode in $p_{\\\\theta}(y|x)$ .  \\n\\nWe use a sequence-to-sequence neural network to model the function $f_{\\\\theta}(x,z)$ . Assume the input variables $x,\\\\ z$ , and the output $y$ are all represented in sequential forms $x=$ $(x_{1},x_{2},\\\\ldots,x_{T})$ ,$z\\\\,=\\\\,(z_{1},z_{2},\\\\dots,z_{T})$ and $y\\\\,=\\\\,(y_{1},y_{2},\\\\ldots,y_{T})$ .The sequence-to-sequence model is made of an encoder and a decoder. The sequential encoder receives $x$ and outputs a representation vector for input $x$ .The sequential decoder receives the output of the encoder as well as $z$ and outputs $y$ in $T$ steps, where $T$ refers to the maximum length for variable $y$ . In the $k$ -th step ( $1\\\\leq k\\\\leq T$ ), the decoder network takes $z_{k}$ , and the hidden vector $h_{k-1}$ from the previous step as inputs, and outputs a score vector $o_{k}=(o_{k1},o_{k2},\\\\dots,o_{k D_{k}})$ of length $D_{k}\\\\;=\\\\;|D(y_{k})|$ .Here, $o_{k}$ corresponds to the un-normalized likelihoods of each value that variable $y_{k}$ can take. The softmax function is then applied to get the normalized probability:  \\n\\n$$\\np_{k j}=p\\\\left(y_{k}=v_{j}|x,h_{k-1}\\\\right)={\\\\frac{\\\\exp(o_{k j})}{\\\\sum_{j^{\\\\prime}=1}^{D_{k}}\\\\exp(o_{k j^{\\\\prime}})}},\\\\qquad{\\\\mathrm{for~}}j=1,2,\\\\ldots,D_{k}.\\n$$  \\n\\n$p_{k j}$ is the probability that variable $y_{k}$ takes the $j$ -th value $v_{j}$ . Assume the prior distribution $p(z_{k})$ is the uniform distribution in $(0,1)$ , denoted by $\\\\mathcal{U}(0,1)$ . Variable $z_{k}$ is sampled from $\\\\mathcal{U}(0,1)$ and is used to determine the value for $y_{k}$ according to the probability distribution vector $p_{k}=(p_{k1},p_{k2},...\\\\,,p_{k D_{k}})$ . 
Let $P_{k1},P_{k2},...,P_{k(D_{k}+1)}$ be the cumulative probabilities:  \\n\\n$$\\nP_{k j}=\\\\left\\\\{\\\\!\\\\!\\\\begin{array}{l l}{0}&{\\\\mathrm{for~}j=1,}\\\\\\\\ {\\\\sum_{j^{\\\\prime}=1}^{j-1}p_{k j^{\\\\prime}}}&{\\\\mathrm{for~}j=2,3,...\\\\,,D_{k},}\\\\\\\\ {1}&{\\\\mathrm{for~}j=D_{k}+1.}\\\\end{array}\\\\!\\\\!\\\\right.\\n$$  \\n\\n$y_{k}$ is set to t e$v_{j}$ if and only if $z_{k}\\\\ \\\\in\\\\ \\\\left[P_{k j},P_{k(j+1)}\\\\right)$ '\\x01.Notice that because $z_{k}$ is sampled from U$\\\\mathcal{U}(0,1)$ 1), the probability that $y_{k}$ takes the $j$ -th value $v_{j}$ is exactly $p_{k j}$ . Aside from producing the value for $y_{k}$ in the $k$ -th step, the sequence-to-sequence neural net also produces the hidden-state vector $h_{k}$ at the $k$ -th step, which is used by the neural net again in the subsequent $(k+1)$ -th step. The overall architecture of the sequence-to-sequence model can be seen in Figure 4.  \\n\\nThe training process of the sequence-to-sequence model is to minimize a pre-defined loss function, or an additional discriminator neural net, which penalizes the differences of the predicted structure $f_{\\\\theta}(x,z)$ and the observed structure $y$ . Here $f_{\\\\theta}(x,z)$ is a predicted sequence obtained from the above process. Given a training data set $\\\\mathcal{D}^{t r}=\\\\{(\\\\boldsymbol{x}^{(i)},\\\\boldsymbol{y}^{(i)})\\\\}_{i=1}^{N}$ ,the learning objective is to minimize the loss function:  \\n\\n  \\nFigure 2: Illustration of Multi-valued Decision Diagrams (MDDs) for decision variables $x_{1},x_{2},x_{3}$ .(a) An exact MDD with all variable assignments satisfying two constraints: all-diff $(x_{1},x_{2},x_{3})$ and $x_{1}\\\\neq v_{1}$ .(b) A width-1 relaxed MDD for the exact MDD in (a). (c) A width-2 relaxed MDD, which is formed by combining nodes $u_{4}$ and $u_{5}$ of the MDD in (a).  
\\n\\n$$\\n\\\\mathcal{L}(\\\\theta)=\\\\frac{1}{N}\\\\sum_{i=1}^{N}\\\\mathbb{E}_{z^{(i)}}\\\\left[\\\\ell\\\\left(f_{\\\\theta}\\\\left(x^{(i)},z^{(i)}\\\\right),y^{(i)}\\\\right)\\\\right].\\n$$  \\n\\nHere $\\\\ell(\\\\cdot,\\\\cdot)$ can be a predefined loss function that measures the mismatch between the predicted and observed structures. Function $\\\\ell(\\\\cdot,\\\\cdot)$ can also be represented as a discriminator network, which leads to the training of a generative adversarial network. The parameters $\\\\theta$ are updated via gradient descent, i.e. ,$\\\\theta^{t+1}=\\\\theta^{t}{-}\\\\eta\\\\nabla{\\\\mathcal{L}}(\\\\theta)$ , where $\\\\eta$ denotes the learning rate.\", 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633586283990, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 10, 'chunk_text': '# 5.3 Text2SQL Generation\\nTask Definition. Formatted data such as travel records and stock market transactions are stored in relational databases. Currently, accessing the database requires a data scientist who masters the SQL query language. Our task is to automatically synthesize SQL queries from natural language sentences using machine learning. Compared with the data expert approach, SQL query generation requires deeper reasoning across the structure of the database, the semantics of the structured query language, and the understanding of natural language. As shown in Figure 11, the input of the text2SQL generation is a sentence that describes the query in natural language and the table headers in the relational database. The output is a SQL query with the following structure:  \\n\\nSELECT agg-op sel-col WHERE (cond-col cond-op cond-val) AND ...  \\n\\nHere, SELECT and WHERE are keywords in the SQL language. 
What we need to predict are: (1) the aggregation operator $\\\\mathsf{a g g-o p}$ , which chooses among the set {empty, COUNT, MIN, MAX, SUM, AVG }; (2) the column name in selection sel-col and (3) the column name in condition cond-col , both of which are chosen from the table headers; (4) the conditional operator cond-op , which is in $\\\\{=,<,>\\\\}$ ; (5) the conditional value cond-val , which is assumed to be a sub-sequence of the given query. Here, one bracket pair () represents one conditional statement. The SQL query may have multiple conditions, which are denoted above by “ ... ”. Figure 11 displays this SQL query:\\n\\n# SELECT COUNT \"School\" WHERE \"No.\" = \"3\"\\nHere agg-op is COUNT ;sel-col is “school”, which is a column name from the table headers. One cond-col is “No.”, which also comes from the table headers. The cond-op is “=”. The cond-val is “3”, which we assume is from the input query. This example has one condition but multiple conditions are allowed.  \\n\\nDefinition of Constraints. Existing generative neural models for this task are not guaranteed to generate a query that follows the grammar of a SQL query. To avoid grammar violations, we compile a set of common SQL grammars as constraints into the Core-Sp module. The Core-Sp module will ensure that all the generated SQL queries follow the grammatical constraints. Our constraints are defined on the operators, namely the conditional operator cond-op and the aggregation operator agg-op . The domains of these operators are dependent upon the data types of the entities (namely, cond-col and sel-col )they operate on. Consider the previous example. The agg-op can only take values between $\\\\{\\\\mathrm{empty,~\\\\coUNT}\\\\}$ , because the sel-col is “school”, which is of the string type. More precisely, let $s$ be a column header (the value of sel-col or cond-col ). 
We define $F_{a}(s)$ as  \\n\\nInput Table:   \\n\\n\\n<html><body><table><tr><td></td><td>Player</td><td>No.</td><td>Position</td><td>School</td></tr><tr><td>0</td><td>Antonio</td><td>21</td><td>Guard-Forward</td><td>Duke</td></tr><tr><td>1</td><td>Voshon</td><td>2</td><td>Guard</td><td>Minnesota</td></tr><tr><td>2</td><td>Marin</td><td>3</td><td>Guard-Forward</td><td>Butler CC</td></tr></table></body></html>\\n\\n# Input Query:\\nHow many schools did player number 3 play at?\\n\\n# Output SQL Query:\\nFigure 11: An example for the Text2SQL generation task. The input is the text query “How many schools did player number 3 play at?” and the table header “ Player, No., Position, School ” from the relational database. The output should be the SQL query: SELECT COUNT \"School\" WHERE \"No. $\"~=~\"3\"$ .  \\n\\nthe set of aggregation operators agg-op that can be associated with $s$ , and $F_{c}(s)$ as the set of condition operators cond-op that can be associated with $s$ . That is:  \\n\\n$$\\n\\\\begin{array}{r l}&{F_{a}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT,~\\\\forall\\\\mathrm{IIN},~\\\\forall\\\\mathrm{IAX},~\\\\forall\\\\mathrm{II},~\\\\mathrm{AVG}\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{empty~,~\\\\varsigma0UNT}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~of~is~string~type}}\\\\end{array}\\\\right.}\\\\\\\\ &{F_{c}(s)=\\\\left\\\\{\\\\begin{array}{l l}{\\\\{\\\\mathrm{=,~\\\\displaystyle>,~\\\\varsigma\\\\}}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~numeric~type}}\\\\\\\\ {\\\\{\\\\mathrm{=}\\\\}}&{\\\\mathrm{if~}s\\\\mathrm{~is~of~string~type}}\\\\end{array}\\\\right.}\\\\end{array}\\n$$  \\n\\nWe also introduce dataype constraints, which are defined as:  \\n\\n$$\\n\\\\begin{array}{r l}&{\\\\mathtt{s e l-c o l}=s\\\\Rightarrow\\\\mathtt{a g g-o p}\\\\in F_{a}(s),}\\\\\\\\ &{\\\\mathtt{c o n d-c o l}=s\\\\Rightarrow\\\\mathtt{c o n d-o p}\\\\in F_{c}(s).}\\\\end{array}\\n$$  \\n\\nModel 
Structure. We embed the Core-Sp module to SQLova (Hwang et al., 2019), the state-of-the-art neural network for text2SQL generation. SQLova has a sequence-tosequence architecture. It first encodes a natural language sentence and the table headers into a high-dimensional vector. Then the decoder of SQLova decodes the hidden representation into the predictions of various entities in the SQL query. SQLova first determines the number of conditions in the SQL query and then fills in the ( cond-col ,cond-op ,cond-val ) for each condition. The operators agg-op, cond-op are predicted as a classification task from a fixed set of operators. Column names cond-col, sel-col are predicted from the set of table headers in the relational database. The cond-val is predicted by a pointer neural network which points at a span of the input natural language sentence. The selected span of the query is used as the cond-val (Dong and Lapata, 2018).  \\n\\nMDD Construction. The associated MDD that encodes the constraints for text2SQL generation is similar to the MDD for if-then program synthesis. The MDD is split into layers and every two layers form a group. One two-layer group is used to enforce constraints on an operator-column name pair. The operator-column name pair can be $\\\\mathsf{a g g-o p}$ and sel-col ,or can be cond-op and cond-col . Note that there can be only one group of $\\\\mathsf{a g g-o p}$ and sel-col and more than one group of cond-op and cond-col . In the first layer of the group, the column name is determined. In the second layer, the invalid operators are ruled out based on the type of the column name selected in the first layer. The two-layer group is copied several times because the SQL query can contain multiple conditions.  
\\n\\nConstraint Reasoning Embedded Structured Prediction', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633686947292, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 13, 'chunk_text': '# 6.2 If-then Program Synthesis\\nDatasets and Metrics. The data sets for this experiment are crawled from the IFTTT and Zapier websites. $^{5,6}$ The statistics of the two data sets are shown in Table 1. The IFTTT data set contains more data samples than the Zapier data set, while the dimensions of the four labels in the Zapier data set are several times larger than those of the IFTTT data set. The sentences in the data set are tokenized by the Spacy library.  \\n\\nTo evaluate the performance of different models on this data set, we consider two metrics: the percentage of valid if-then programs, and accuracy. A program is considered valid if it satisfies our defined Functionality constraints. The accuracy metric is the percentage of predicted programs that match exactly in all four fields with those in the test set. This metric shows the percentage of correctly predicted programs.  \\n\\n<html><body><table><tr><td>Dataset</td><td>#train set</td><td>#val set</td><td>#test s set</td><td>#quadruple</td><td>#vocabulary</td></tr><tr><td>IFTTT</td><td>66761</td><td>4148</td><td>2640</td><td>(111, 443, 88, 161)</td><td>4000</td></tr><tr><td>Zapier</td><td>24454</td><td>4809</td><td>2576</td><td>(1353, 1755, 1333, 1466)</td><td>3782</td></tr></table></body></html>  \\n\\n  \\nTable 1: The statistics for the IFTTT and Zapier data sets.   \\nFigure 15: Percentage of valid programs (left column) and MDD memory consumption (right column) on IFTTT and Zapier data sets. Core-Sp outperforms the state-of-the-art approach LatentAttention (Liu et al., 2016) in generating valid if-then programs. 
The percentages of valid programs generated by Core-Sp embedding MDDs with different widths are shown for the IFTTT (top left) and Zapier (bottom left) data sets. Core-Sp model that embeds the exact MDD produces $100\\\\%$ valid programs on the two data sets. The relaxed and exact MDD for the IFTTT data set takes less than 4 MB and for the Zaiper data set takes less than $20\\\\ \\\\mathrm{MB}$ memory space.  \\n\\nValid Programs Comparison. Core-Sp significantly boosts the percentage of valid programs generated. In this experiment, we start with evaluating the percentage of valid programs generated from the state-of-the-art LatentAttention model without the CoreSp module. Then we apply the Core-Sp module from Algorithm 1, which iteratively increases the width of the relaxed MDD until we arrive at the exact MDD. Figure 15 shows the performance of all the relaxed and the exact Core-Sp modules when added to the LatentAttention model. Among all programs produced by the LatentAttention model without the Core-Sp layer, around $88\\\\%$ of them are valid on the two data sets. Once we enforce the exact Core-Sp capturing the Functionality constraint, all the programs (100%) produced are valid. We also study the effect of restricting the maximum layer width of the MDDs used in Algorithm 1. We evaluate Core-Sp with MDDs of width-2 up to the largest width, which is width-111 for IFTTT and width-1353 for Zapier. The percentage of valid programs on a separate testing set is shown in the blue lines. The performance of the relaxed Core-Sp increases gradually with the increase of the MDD width.  \\n\\n  \\nFigure 16: The Core-Sp module (red lin brings approximately $1\\\\mathrm{-}2\\\\%$ increase in accuracy for the IFTTT data set and 2% increase for the Zapier data set for the if-then program synthesis task. The LatentAttention model (blue line) is the previous state-of-the-art, which cannot guarantee the validity of the programs generated.  \\n\\nAccuracy Comparison. 
Figure 16 compares the training set and testing set accuracy for the state-of-the-art LatentAttention model and Core-Sp as the training progresses.  \\n\\n<html><body><table><tr><td></td><td colspan=\"3\">IFTTT</td><td colspan=\"3\">Zapier</td></tr><tr><td>Methods</td><td>Width</td><td>Accuracy</td><td>Valid (%)</td><td>Width</td><td>Accuracy</td><td>Valid 1 (%)</td></tr><tr><td>LatentAttention</td><td>N/A</td><td>42.17%</td><td>87.51%</td><td>N/A</td><td>31.74%</td><td>88.00%</td></tr><tr><td>BestrelaxedCoRE-SP</td><td>80</td><td>44.12%</td><td>99.19%</td><td>1200</td><td>34.28%</td><td>99.53%</td></tr><tr><td>Exact CORE-SP</td><td>111</td><td>43.07%</td><td>100%</td><td>1353</td><td>32.83%</td><td>100%</td></tr></table></body></html>  \\n\\nTable 2: The relaxed and exact Core-Sp modules boost the percentage of valid programs generated and the accuracy for the if-then program synthesis task on both the IFTTT and the Zapier data sets. Exact Core-Sp produces $100\\\\%$ valid programs while Core-Sp with the best relaxed MDD produced by Algorithm 1 leads to the best accuracy in the prediction and close to $100\\\\%$ valid programs.  \\n\\nWe also collect the results of the LatentAttention model without Core-Sp , the model with the best relaxed Core-Sp model (in terms of accuracy) and with the exact Core-Sp model on the two data sets in Table 2. The best relaxed Core-Sp model ves $1-2\\\\%$ higher accuracy than the LatentAttention model and still generates around 11% more valid programs than the LatentAttention model. Similarly, the model with the exact Core-Sp module improves approximately $1\\\\%$ in accuracy but generates $100\\\\%$ valid programs.\\n\\n# 6.3 SQL Query Generation from Natural Language\\nDataset and Metrics. We conduct experiments on the large-scale WikiSQL data set (Zhong et al., 2017), which contains 80 ,654 examples of questions and SQL queries distributed across 24 ,241 tables from Wikipedia. 
We observe that most of the SQL queries are not complex. Therefore, we further select queries within the data set to form a moderate and a hard test set. The moderate test set consists of those queries containing at least one conditional statement ( i.e. , “ cond-col cond-op cond-val ”). The hard test set is composed of those queries that have at least two conditional statements.  \\n\\nThe metrics applied for this task are: 1) Percentage of valid SQL queries, i.e. , generated queries that satisfy the datatype constraint. 2) Execution accuracy. A generated query is considered correct if the returned value of executing the generated SQL query matches the returned value from the ground truth query. 3) Logical accuracy, which evaluates the percentage of the generated queries that match exactly the ground truth queries in every field. The implementation is based on SQLNova. We use the BERT-base model (Devlin et al., 2019) as the word embedding. The entire model takes up to 3 days to train for 50 epochs. We choose the model that achieves the best execution accuracy on the validation data set for both the baseline and Core-Sp and calculate the corresponding statistics reflected in Table 3.  \\n\\nValid SQL Queries Comparison. As shown in Table 3, SQLNova with the Core-Sp module embedded generates $100\\\\%$ valid SQL programs, demonstrating 0.7% improvement over the original SQLNova model on the full testing set. On the moderate testing set, the improvement increases to $5.7\\\\%$ . On the most difficult hard testing set, the improvement becomes $16.3\\\\%$ . 
Due to the fact that a majority of the SQL queries in the full test set have empty value at cond-op and $=$ value at sel-op , SQLNova has a high probability to predict  \\n\\nJiang, Zhang, Hoeve and Xue  \\n\\n<html><body><table><tr><td>Accuracy</td><td colspan=\"2\">Full test set</td><td colspan=\"2\">Moderate test set</td><td colspan=\"2\">Hard test set</td></tr><tr><td>per component</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td></tr><tr><td>sel-col</td><td>96.3%</td><td>96.3%</td><td>96.4%</td><td>97.0%</td><td>96.6%</td><td>97.7%</td></tr><tr><td>agg-op</td><td>89.8%</td><td>89.7%</td><td>75.7%</td><td>77.8%</td><td>75.4%</td><td>75.8%</td></tr><tr><td>#WHERE</td><td>98.1%</td><td>97.9%</td><td>98.5%</td><td>98.6%</td><td>98.9%</td><td>98.5%</td></tr><tr><td>cond-col</td><td>93.6%</td><td>93.6%</td><td>94.0%</td><td>93.8%</td><td>93.6%</td><td>93.7%</td></tr><tr><td>cond-op</td><td>96.7%</td><td>96.9%</td><td>89.8%</td><td>91.6%</td><td>84.8%</td><td>87.9%</td></tr><tr><td>where-val-idx</td><td>94.5%</td><td>94.8%</td><td>89.4%</td><td>92.3%</td><td>86.7%</td><td>87.5%</td></tr><tr><td>where-val</td><td>94.7%</td><td>94.9%</td><td>89.3%</td><td>92.2%</td><td>86.4$</td><td>87.1%</td></tr><tr><td rowspan=\"2\">Overall Accuracy</td><td colspan=\"2\">Full test set</td><td colspan=\"2\">Moderate test set</td><td colspan=\"2\">Hard test set</td></tr><tr><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td><td>SQLNova</td><td>CORE-SP</td></tr><tr><td>Logical Accuracy</td><td>79.3%</td><td>79.9%</td><td>61.6%</td><td>65.8%</td><td>58.3%</td><td>62.5%</td></tr><tr><td>Execution Accuracy</td><td>85.5%</td><td>86.1%</td><td>75.4%</td><td>79.1%</td><td>76.1%</td><td>78.0%</td></tr><tr><td>Valid SQL</td><td>99.3%</td><td>100.0%</td><td>94.3%</td><td>100%</td><td>83.7%</td><td>100%</td></tr></table></body></html>  \\n\\nTable 3: Core-Sp outperforms the previous state-of-the-art SQLNova on three testing sets in 
SQL query generation. Core-Sp leads to $100\\\\%$ valid SQL queries generated and increases in both the execution accuracy and the logical accuracy compared with SQLNova for the Text2SQL generation task. The top table shows the accuracy of predicting each field in the SQL queries for both models.  \\n\\nprevalent labels in the data set and coincidentally satisfies the SQL grammar. This is the main reason that our relative improvement is not significant for the full test set.  \\n\\nExecution and Logical Accuracy. Figure 17 compares SQLNova and the exact CoreSp model over execution and logical accuracy metrics as the training progresses. We also collect the accuracy of predicting each field in the SQL queries as shown in the table (top) of Table 3. The execution and logical accuracy are shown at the bottom of Table 3. CoreSp gains improvement for predicting sel-col ,cond-op ,where-val-idx and where-val components. For the other components in the SQL queries, the difference in accuracy between Core-Sp and SQLNova is less than $0.4\\\\%$ . In terms of the execution accuracy, the exact Core-Sp is higher than SQLNova by $0.6\\\\%$ ,$3.7\\\\%$ and $1.9\\\\%$ on the full, moderate, and hard test sets, respectively. In terms of the logical accuracy, the exact Core-Sp is higher than SQLNova by 0 .6%, 4 .2%, and $4.2\\\\%$ for the three testing sets. The improvement in the execution and logical accuracy is due to the fact that the Core-Sp module removes invalid operators during SQL generation and as a consequence reduces the modeling space.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846633719453150, 'paper_id': '64a29654d68f896efa29af31', 'paper_title': 'Constraint Reasoning Embedded Structured Prediction.', 'chunk_id': 14, 'chunk_text': '# 7. 
Conclusion\\nIn this work, we proposed Core-Sp , an end-to-end neural module that embeds constraint reasoning into machine learning for structured prediction problems. Core-Sp represents the constraints using decision diagrams and filters out invalid solutions. Core-Sp is then embedded into a neural network which can be trained in an end-to-end fashion. We demonstrate the effectiveness of Core-Sp on three structured prediction applications including vehicle dispatching service planning, if-then program synthesis, and Text2SQL generation. We also propose an iterative search algorithm to find the optimal decision diagram structure for these applications. We show that the Core-Sp module improves constraint satisfaction in all three applications. In addition, Core-Sp reduces the modeling space. As a consequence, neural networks with Core-Sp embedded learn faster and generalize better than the pure neural network models. For future work, we plan to generalize Core-Sp in continuous domains and in reinforcement learning.  \\n\\n  \\nFigure 17: The execution accuracy (left) and logical accuracy (right) over training iterations for both Core-Sp and SQLNova. Core-Sp leads to higher execution and logical accuracy throughout the training iterations.\\n\\n# Acknowledgments\\nWe thank all the reviewers for their constructive comments. This research was supported by NSF grants IIS-1850243, CCF-1918327, CCF-1918102, and Office of Naval Research Grant No. N00014-21-1-2240. M. Z. 
completed this work when he was a master’s student at Purdue University.', 'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}, {'id': 454846943844734304, 'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 0, 'chunk_text': '# Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries\\nXinyi $\\\\mathbf{H}\\\\mathbf{e}^{1*}$ , Mengyu Zhou 2† , Xinrun $\\\\mathbf{X}\\\\mathbf{u}^{3*}$ , Xiaojun $\\\\mathbf{M}\\\\mathbf{a}^{2}$ , Rui $\\\\mathbf{Ding}^{2}$ , Lun $\\\\mathbf{D}\\\\mathbf{u}^{2}$ ,Yan $\\\\mathbf{Gao}^{2}$ , Ran $\\\\mathbf{Jia}^{2}$ , Xu Chen 2 , Shi $\\\\mathbf{H}\\\\mathbf{a}\\\\mathbf{n}^{2}$ , Zejian Yuan 1 , Dongmei Zhang 2  \\n\\n1 Xi’an Jiaotong University, 2 Microsoft, 3 Institute of Software Chinese Academy of Science hxyhxy $@$ stu.xjtu.edu.cn, xuxinrun $20\\\\,\\\\@$ mails.ucas.ac.cn, yuan.ze.jian $@$ xjtu.edu.cn, {mezho, xiaojunma, juding, lun.du, gaoya, raji, xu.chen, shihan,dongmeiz }@microsoft.com\\n\\n# Abstract\\nTabular data analysis is crucial in various fields, and large language models show promise in this area. However, current research mostly focuses on rudimentary tasks like Text2SQL and TableQA, neglecting advanced analysis like forecasting and chart generation. To address this gap, we developed the Text2Analysis benchmark, incorporating advanced analysis tasks that go beyond the SQL-compatible operations and require more in-depth analysis. We also develop five innovative and effective annotation methods, harnessing the capabilities of large language models to enhance data quality and quantity. Additionally, we include unclear queries that resemble real-world user questions to test how well models can understand and tackle such challenges. Finally, we collect 2249 query-result pairs with 347 tables. 
We evaluate five state-ofthe-art models using three different metrics and the results show that our benchmark presents introduces considerable challenge in the field of tabular data analysis, paving the way for more advanced research opportunities.', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}, {'id': 454846943883007330, 'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 1, 'chunk_text': '# 1 Introduction\\nTabular data analysis plays a crucial role in various fields, and automated data analysis has the potential to enhance people’s work efficiency significantly (Delen and Ram 2018a). The emergence of large language models has shown promising capabilities to accelerate tabular data analysis (Chen 2023; Ye et al. 2023; Ma et al. 2023; Jiang et al. 2023). Understanding the analytical abilities of these models, identifying the analysis processes they can replace, and determining the analysis steps they can assist with have become pressing questions in the field.  \\n\\nExisting research on tabular data analysis has limited coverage of data analysis. As shown in Figure 2, data analysis can be divided into descriptive, diagnostic, predictive, and prescriptive analytics (Delen and Ram 2018b). The existing Text2SQL and TableQA datasets (Dong and Lapata 2016; Katsogiannis-Meimarakis and Koutrika 2021) focus primarily on rudimentary operations that are part of descriptive analytics and can be mostly solved by SQL and OLAP operations. They pay limited attention to advanced analysis (see $\\\\S2.1\\\\rangle$ ) that require advanced operations and visualizations beyond rudimentary operations, such as calculating insights, forecasting, and chart generation (see examples in Figure 1).  \\n\\n  \\nFigure 1: Examples of Text2Analysis Benchmark.  
\\n\\nIn the real world, many user queries are often described in unclear ways (Wang et al. 2023). When solving advanced or complex data analysis tasks with a large set of available tools and APIs, it is hardly the case that a user could write clear instructions with complete intent and parameters. As we will discuss in $\\\\S2.2$ , the most common “unclear query” type is missing parameters for analysis tasks. E.g. , the query “Help me create a chart to visualize the sales for BMW, Compat, BMW 3-Serie” does not explicitly specify the chart type to be drawn or the field to be mapped to the $\\\\mathbf{X}$ axis. Responding accurately to these queries not only demands the semantic parsing abilities of large language models but also requires them to possess strong data analysis capabilities to recommend intent beyond the query.  \\n\\nIn this paper, we propose the Text2Analysis benchmark which expands beyond rudimentary operations and clear instructions. The benchmark incorporates unclear queries that involve advanced data analysis. Similar to Text2SQL datasets, in Text2Analysis the input is the (table, query) pair, and the output is the (code, result) pair. The ground-truth code only leverages a set of chosen data analysis APIs / operations from public and customized Python libraries such as Pandas, Prophet and Matplotlib.  \\n\\nCollecting the dataset is a difficult task because each sample in the Text2Aanlysis dataset simultaneously contains a table, query, Python code, and result. It requires annotators with related expert backgrounds and would consume a lot of time. To accelerate the annotation process and increase the volume of annotated data, we have developed five innovative and reliable annotation methods. Those methods make full use of large language models to perform forward annotations, expansion with new tables, and expansion unclear queries. Meanwhile, some methods also collect data from the output, such as reverse generation from codes or results. 
We collect 2249 (query, code, result) pairs with 347 tables. To ensure annotation quality, iterative annotation and human evaluation are employed. Their results and dataset distribution indicate that Text2Analysis has a diverse, high-quality data analysis dataset.  \\n\\nDue to the numerous tasks involved in the problem and the outputs consisting of both code and results, evaluating the generated solutions with appropriate metrics poses a challenge. We have selected three metrics to evaluate from different perspectives: executable code ratio, pass rate, and regression metrics. The executable code ratio evaluates the model’s ability to generate executable code. Pass rate evaluates the correctness of the generated code. Regression scores measure the predicting capability of the chosen model within the generated code.  \\n\\nFurthermore, we provide an evaluation of five current state-of-the-art models, including GPT family models, code generation models, and tabular models. We evaluate their performance in handling advanced analysis and unclear queries. Our experiment indicates that large language models exhibit robust parsing and code generation aptitudes for data analysis in the context of clear queries. However, they grapple with complex libraries and unclear queries. To augment their efficacy, future research can concentrate on bolstering the capacity to recommend fields for sophisticated analyses and tackling complex operations such as operations with complex parameter input and model training.  \\n\\nIn summary, our main contributions are:  \\n\\n• We create the Text2Analysis benchmark which includes advanced analysis tasks and unclear queries that were rarely addressed in previous research work. The dataset and code will be open-sourced on  \\n\\n  \\nFigure 2: Advanced Analysis consists of Advanced Operations and Visualizations that are not covered by Rudimentary Operations across descriptive, diagnostic, predictive, and prescriptive analytics.  
\\n\\nhttps://github.com/microsoft/Text2Analysis. • We propose five innovative and reliable annotation methods for the construction of NL2Code datasets. They utilize large language models to accelerate the annotation process and increase the volume of annotation. • The performance of the baseline models is systematically evaluated against our Text2Analysis benchmark. Our experiments show the challenges to be solved in the future to satisfy real-world table analysis needs.', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}, {'id': 454846943921018212, 'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 2, 'chunk_text': '# 2 Problem Definition\\nWe introduce the Text2Analysis problem as follows: $(t a b l e,\\\\ q u e r y)\\\\to(c o d e,\\\\ r e s u l t)$ . The input consists of a table and a user query . The output consists of Python code snippet(s) and the corresponding result (s). A table has n fields $T\\\\ =\\\\ (f_{1},...,f_{n})$ , and each field consists of a field header and field values. A query is related to data analysis, particularly focusing on advanced analysis $(\\\\S2.1)$ that addresses the shortcomings of existing work and presents a greater and more difficult challenge for models. Additionally, it includes unclear queries $(\\\\S2.2)$ , which are often found in real-world user scenarios and can more effectively evaluate the model’s analytical capabilities.\\n\\n# 2.1 Analysis Operations and Tasks\\nText2Analysis expands the data analysis dataset to advanced analysis tasks. As shown in Figure 2, Data analysis can be divided into descriptive (what happened?), diagnostic (why did it happen?), predictive(what will happen), and prescriptive analytics (what should I do?) (Delen and Ram 2018b). And reporting and visualization may follow each type of analytics. 
Existing research on table-based data analysis tasks, such as TableQA and Text2SQL (Dong and Lapata 2016; Katsogiannis-Meimarakis and Koutrika 2021), has focused mainly on part of descriptive analytics that can be solved by SQL. They pay insufficient attention to advanced analysis that are beyond the rudimentary operations and require more in-depth analysis.  \\n\\nThe advanced analysis portion of Text2Analysis selects representative tasks from each type of analytics to form the dataset. From descriptive and diagnostic analytics, basic insights are chosen. From predictive analytics, forecasting is selected. And from reporting and visualization, chart generation is chosen. A more detailed introduction to each task will be provided after the following paragraph.  \\n\\nAdvanced analysis, along with rudimentary operations, form the Text2Analysis dataset. They can be combined to form a complex analysis. rudimentary operations and advanced operations (tasks in advanced analysis that output data such as tables and values, that is, tasks excluding reporting and visualization) can be interconnected, and reporting and visualization can be performed subsequently for display.  \\n\\nWe introduce the involved tasks one by one as follows:  \\n\\n1. Rudimentary Operations : These operations encompass a set of functions and procedures that can be executed using the Structured Query Language (SQL) (Date 1989). Their primary purpose is to enable users to perform data management, manipulation, and transformation on multidimensional structured data. The main operations include group by, aggregation, filter, sort, and so on.  \\n\\n2. Basic Insights : In the context of a multi-dimensional dataset, an insight represents an interesting observation about a particular subject from a specific perspective (Ding et al. 2019; Ma et al. 2021; Chen, Yang, and Ribarsky 2009). 
Text2Analysis incorporates seven commonly used insights:
\\n• Chart Generation: Matplotlib 5 (pyplot ).\\n\\n# 2.2 Unclear Queries\\nIn many real situations, users do not directly provide complete queries, but rather give queries with some unclear intents. There are various ways to address them, such as recommending completions for the missing intents or guiding users to complete the query. This paper focuses on proposing a benchmark and does not explore the solution methods in depth. We only use the model for recommendations, which can also satisfy the exploration of the next purpose.  \\n\\nSecondly, the analysis and recommendation capabilities of large language models can be explored through unclear queries. When recommending for unclear queries, the model not only needs to possess semantic parsing capabilities but also requires analytical recommendation capabilities. Exploring these capabilities of large language models is crucial for better utilizing them in the analytical domain.  \\n\\nAn unclear query lacks the essential information required to perform tasks. In other words, in the query, there are missing parameters for generating the analysis code which consists of operations from the chosen libraries. Since the same task may require different parameters in different libraries, we have combined the representatively used libraries for each task in $\\\\S2.1$ and selected the essential parameters as shown in Table 1. Some parameters are not provided for missing parameters as follows:  \\n\\n• When a parameter is absent, the associated operator will be excluded from use. The parameters for this scenarios are dimension field for rudimentary operations, filter condition for rudimentary operations, insight type for basic insights.  \\n\\n• Parameters are typically not mentioned in the query or possess standard default values. The parameters for similar scenarios are confidence for forecasting, $p$ -value for basic insights, measure aggregation for basic insights.  
\\n\\nIn addition to missing parameters, there are other types of unclear queries, such as, ambiguous parameters, unclear tasks. For ambiguous parameters, a query may have all parameters provided, but they are ambiguous or vague. E.g. , a table has two fields, UnitPrice and TotalPrice, but the query only mentions “price”, resulting in ambiguity. Another example is when a query mentions filtering “young people”, but there is no universally accepted definition of “young”, leading to varied age filters. There are more details in (Wang et al. 2023), and we will not discuss this further in this paper.  \\n\\nTable 1: Taxonomy and Examples of Unclear Queries   \\n\\n\\n<html><body><table><tr><td>Tasks</td><td>Parameters</td><td>Meanings of Parameters and Missing Parameters Query</td></tr><tr><td rowspan=\"3\">Rudimentary Operations</td><td>clear</td><td>E.g., Which brand has the highest total sales in 2023?</td></tr><tr><td>field (msr_field)</td><td>Measure field for sort or aggregation. E.g., Which brand had the best overall in 2023?</td></tr><tr><td>agg (agg-func)</td><td>Aggregation function, such as sum, average... E.g., Which brand has the highest sales in 2023?</td></tr><tr><td rowspan=\"2\">Basic Insights</td><td>clear</td><td>E.g., Does total increase over time?</td></tr><tr><td>field</td><td>Field for the insight. E.g., Is there an increase over time?</td></tr><tr><td rowspan=\"3\">Forecasting</td><td>clear</td><td>E.g., Forecast the cost data of different brands, categories and models of cars in 2012.</td></tr><tr><td>forecast field</td><td>Measure field used for forecasting. E.g., What will be for different categories and models of cars in 2012?</td></tr><tr><td>steps / freq</td><td>Forecasting steps and/or frequency. 
E.g., What will be the sales and cost data of different brands, categories and models of cars?</td></tr><tr><td rowspan=\"4\">Visualization</td><td>clear</td><td>E.g., Help me create a bar chart to visualize the Frequency field for the HH field.</td></tr><tr><td>chart type</td><td>Char type, including lineChart, barChart, scatterChart, pieChart. E.g., Help me create a chart to visualize the Frequency field for the HH field.</td></tr><tr><td>x fields</td><td>Fields for x-axis. E.g., Help me create a bar chart to visualize the Frequency field.</td></tr><tr><td>y fields</td><td>Fields for y-axis. E.g., Help me create a bar chart to visualize for the HH field.</td></tr></table></body></html>  \\n\\nFor unclear tasks, a query does not explicitly specify what task to use for analysis, e.g. , “What should I do if I want to get more profits”. This query only proposes a goal without specifying any tasks, and solving such problems requires stronger problem-solving abilities. In this work, we will not discuss this further and will consider it as future work.', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}, {'id': 454846943997039976, 'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 4, 'chunk_text': '# 3.3 Data Statistics and Distribution\\nText2Analysis encompasses a total of 2249 (table, query, code, result )pairs, sourced from 347 distinct tables. Queries of Text2Analysis encompass a variety of tasks, as demonstrated in Figure 4. And they encompass a diversity of unclear queries, as demonstrated in Figure 5. Those figures highlight the distribution of queries and code and further showcase the diversity of the dataset and the difficulty of the problem.  \\n\\n  \\n  \\nFigure 4: Analysis Task Distribution of All Queries.  
\\n\\n  \\nFigure 5: Task & Parameter Distribution of Unclear Queries.\\n\\n# 4 Evaluation Methodology\\n\\n# 4.1 Baselines\\nWe evaluate the performance of three types models namely GPT family models, code Generation models and tabular models on Text2Analysis:  \\n\\nGPT family models : GPT-4 models (OpenAI 2023) are potent large-scale language models with the ability to generate human-like text and high-quality code. They perform a wide range NLP tasks well with zero or few shots.  \\n\\nCode Generation models : StarChat$\\\\cdot\\\\alpha/\\\\beta$ (Tunstall et al. 2023) and CodeGen2.5 (Nijkamp et al. 2023) are language models specifically designed to serve as effective coding assistants, providing valuable support to programmers.  \\n\\nStarChat$\\\\cdot\\\\alpha$ /StarChat$\\\\beta$ , derived from the StarCoder(Li et al. 2023) family, are fine-tuned language models with 15.5 billion parameters, adept at aiding programmers across ${80+}$ programming languages. Unlike original StarCoder models that focused on code completion, StarChat versions are better suited for Text2Analysis tasks that require query instructions and task explanations.  \\n\\nCodeGen2.5, an autoregressive language model built upon CodeGen2. The model is trained on StarCoderData for 1.4T tokens, achieving competitive results compared to StarCoderBase-15.5B with less than half the size.  \\n\\nTabular models : TAPEX(Liu et al. 2022) (Table Pretraining via Execution) is a straightforward yet highly effective pre-training method designed to enhance existing models with table reasoning capabilities. 
It achieves stateof-the-art performance on TableQA, Text2SQL and TabFact datasets such as WikiSQL(Zhong, Xiong, and Socher 2017).', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}, {'id': 454846944082498924, 'paper_id': '6584feac939a5f4082397b62', 'paper_title': 'Text2Analysis: A Benchmark of Table Question Answering with Advanced Data Analysis and Unclear Queries', 'chunk_id': 6, 'chunk_text': '# 5 Experiments\\nWe conduct experiments on the five baselines introduced in $\\\\S4.1$ . For the GPT family and code generation models, we design instruction prompts that include the HTML table, constraints on code generation libraries, requirements for result formatting, and so on. The parameter details for each model in the experiment are as follows:  \\n\\n• GPT-4: model: gpt-4-32k, temperature: 0, maximum length: 4096.   \\n• StarChat$\\\\alpha$ : model: starchat-alpha 7 , temperature: 0.2, max new tokens: 1024.   \\n• StarChat$\\\\beta$ : model: starchat-beta 8 , temperature: 0.2,  \\n\\n7 https://huggingface.co/HuggingFaceH4/starchat-alpha   \\n8 https://huggingface.co/HuggingFaceH4/starchat-beta\\n\\n# max new tokens: 1024.\\n• CodeGen2.5: model: codegen25-7b-instruct 9 , temperature: 0.2, max new tokens: 1024.  \\n\\n• TAPEX: model: tapex-large-finetuned-wtq10.\\n\\n# 5.1 Main Results\\nAs shown in Table 2, overall experimental results demonstrate that GPT-4 outperforms other models. It achieves the highest ECR on the majority of tasks and the highest pass rate across all tasks. GPT-4’s relatively better performance can be attributed to its code generation capabilities and context learning abilities. The former allows it to generate more accurate and executable code. The latter enables it to better understand and integrate the given query and instructions.  \\n\\nCode generation models exhibit overall performance that is comparable to GPT-4 in generating executable code. 
However, the pass rate of the generated code is relatively low, with the overall pass rate being $24.86\\\\%$ lower than that of GPT-4. This can be attributed to the limited in-context learning capabilities of these models, which results in a restricted ability to capture the meaning of the given query and generate the correct code accordingly.  \\n\\nThe tabular model is currently only capable of completing rudimentary tasks. Their performance on the Text2Analysis benchmark is subpar, with an pass rate of only $11.55\\\\%$ . One reason for this is that rudimentary tasks in the benchmark involve complex pivot operations and calculations, which existing tabular models struggle with. These models excel at querying tables to find values in the original table but falter when it comes to performing complex calculations.  \\n\\n<html><body><table><tr><td>Model</td><td>CORR ↑</td><td>RMSE ↓</td><td>MAE←</td><td>MedAE ↓</td></tr><tr><td>GPT4</td><td>0.10</td><td>0.27</td><td>0.24</td><td>0.27</td></tr><tr><td>StarChat-β</td><td>0.16</td><td>0.76</td><td>0.76</td><td>0.73</td></tr></table></body></html>  \\n\\nTable 3: Regression Scores for Forecasting. We show the models with the highest ECR and pass means that smaller is better. CORR is better when its ab$@1$ in Table 2. ↓solute value is closer to 1. Thus when ${\\\\mathrm{CORR}}\\\\in[0,\\\\,1]$ , larger is better.  \\n\\nTable 3 presents the regression scores for the forecasting task, demonstrating that the code generated by the baselines has limited prediction capabilities. To successfully tackle forecasting task, not only do the models need to generate correct code, but they also need to select appropriate prediction models. The current baseline models fall short in these aspects, highlighting the need for further development and research to improve their performance on forecasting tasks.  \\n\\nThe overall performance of the baseline models leaves room for improvement. 
Its best performance is on rudimentary operations tasks, with an pass rate of only $57\\\\%$ . Moreover, on complicated forecasting, it can only reach an pass rate of $14\\\\%$ . This highlights the significant exploration space that still exists, presenting opportunities for further research and the development of more advanced models.\\n\\n# 5.2 Unclear Queries Results\\nFigure 6: ECR for Unclear Queries on GPT-4.  \\n\\n  \\nFigure 7: Pass rate for Unclear Queries on GPT-4.  \\n\\nWhen facing clear queries, large models have strong parsing and code generation capabilities for data analysis. As shown in Figure 7, especially when the query is clear, the pass rate of chart generation is as high as $86\\\\%$ . When facing a clear query, the large model needs to first parse the natural language into corresponding tasks and parameters, and then generate the correct code.  \\n\\nThe ability to recommend fields for advanced data analysis tasks, particularly measure fields (columns with numerical attributes in a table), can be enhanced in large language models. As shown in Figure 6, the ECR decreases by $8\\\\%$ on the basic insights task when the field is missing. As shown in Figure 7, when the measure field is missing, the pass rate has decreased for most tasks, especially the chart generation task, which has decreased by $25\\\\%$ . If we want to improve the recommendation analysis, future work needs to consider injecting the knowledge of recommending analytical columns into the large language models.  \\n\\nThe code generation capability for more complex libraries needs to be enhanced. As shown in Figure 6 and Figure 7, both the ECR and pass rate for forecasting tasks are below $1\\\\%$ . The forecasting task library involves more complex operations such as parameter input and model training. In more than $50\\\\%$ of the cases, GPT-4 generates incorrect parameters or input parameters that are not included in the operations. 
Additionally, in some instances, the code does not select the correct model, rendering it unable to successfully fit the data.\\n\\n# 6 Related Work\\n\\n# 6.1 Tabular Benchmark\\nTableQA and Text2SQL are prevalent tasks in tabular data analysis. These tasks entail answering user queries based on the information present in a source table. Notable their datasets include WikiTableQuestions (Pasupat and Liang 2015), WikiSQL (Zhong, Xiong, and Socher 2017) and so on. Although numerous related datasets encompass a wide variety of table types, the primary focus remains on descriptive data analysis. Text2Analysis dataset expands to more tabular analysis tasks.  \\n\\nTo address tabular tasks, pre-trained models like TAPAS (Herzig et al. 2020), TAPEX(Liu et al. 2022), etc. have been employed. Concurrently, large language models have also been used in approaches like DATER (Ye et al. 2023), StructGPT (Jiang et al. 2023), etc. In this work, we evaluate SOTA tabular models as comparison baselines.', 'original_filename': 'Conf_Paper_Meta_Data_AAAI2024_with_whole_text.db'}]\n"
     ]
    }
   ],
   "source": [
    "# Test query_by_chunk_contain API\n",
    "print(\"\\nTesting /query_by_chunk_contain API...\")\n",
    "call_api(\"query_by_chunk_contain\", params={\"chunk\": \"Text2SQL\", \"top_k\": 15})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "a5cbdb23-1018-4bca-bfcf-30ec8b98e576",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': 454846633242875330,\n",
       "  'paper_id': '64a29654d68f896efa29af31',\n",
       "  'paper_title': 'Constraint Reasoning Embedded Structured Prediction.',\n",
       "  'chunk_id': 0,\n",
       "  'chunk_text': '# Constraint Reasoning Embedded Structured Prediction\\nNan Jiang   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Maosen Zhang ByteDance Beijing, China.  \\n\\n Willem-Jan van Hoeve Tepper School of Business Carnegie Mellon University Pittsburgh, Pennsylvania, USA.  \\n\\n  \\n\\nYexiang Xue   \\nDepartment of Computer Science Purdue University   \\nWest Lafayette, Indiana, USA.  \\n\\n Editor: Maya Gupta\\n\\n# Abstract\\nMany real-world structured prediction problems need machine learning to capture data distribution and constraint reasoning to ensure structure validity. Nevertheless, constrained structured prediction is still limited in real-world applications because of the lack of tools to bridge constraint satisfaction and machine learning. In this paper, we propose CO nstraint RE asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over structured domains. We propose to embed decision diagrams, a popular constraint reasoning tool, as a fullydifferentiable module into deep neural networks for structured prediction. We also propose an iterative search algorithm to automate the searching process of the best Core-Sp structure. We evaluate Core-Sp on three applications: vehicle dispatching service planning, if-then program synthesis, and text2SQL generation. The proposed Core-Sp module demonstrates superior performance over state-of-the-art approaches in all three applications. The structures generated with Core-Sp satisfy 100% of the constraints when using exact decision diagrams. In addition, Core-Sp boosts learning performance by reducing the modeling space via constraint satisfaction.  \\n\\nKeywords: Constraint Reasoning, Decision Diagrams, Structured Prediction.',\n",
       "  'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'},\n",
       " {'id': 454846633277740484,\n",
       "  'paper_id': '64a29654d68f896efa29af31',\n",
       "  'paper_title': 'Constraint Reasoning Embedded Structured Prediction.',\n",
       "  'chunk_id': 1,\n",
       "  'chunk_text': '# 1. Introduction\\nThe emergence of large-scale constraint reasoning and machine learning technologies have impacted virtually all application domains, including marketing, linguistics, operations, retail, robotics, and health care. Constraint reasoning has traditionally been applied to building prescriptive models that generate solutions for strategic, tactical, or operational use (Choi et al., 2012). It requires a precise problem description and is usually difficult to be made flexible to the evolving data distributions. Machine learning, on the other hand, has been applied primarily to build predictive models, such as classifications or regressions (Michalski and Anderson, 1984; Bishop, 2007). While the structure of a machine learning model (like a neural network) must be designed, the actual model parameters are learned automatically via gradient descent algorithms. This gives machine learning models the flexibility to adapt to the evolving data distributions. Nevertheless, it is difficult to enforce constraints on the output of machine learning models. Many real-world applications are beyond the reach of constraint reasoning or machine learning alone.  \\n\\n  \\nFigure 1: (a) Our proposed Core-Sp framework embeds constraint reasoning in machine learning for structured prediction. We demonstrate the effectiveness of Core-Sp on vehicle dispatching service, if-then program synthesis, and Text2SQL generation tasks. (b) At a high level, Core-Sp (in orange colored box) is a fully differentiable layer that simulates a path descending in the corresponding decision diagram. Core-Sp filters out the infeasible output from the structured output to ensure constraint satisfaction.  \\n\\nIn this paper, we focus on structured prediction problems, which is a class of learning problems requiring both constraint reasoning and machine learning. It expands the output space of classification problems into high-dimensional structured space. 
Structured prediction has diverse application domains, ranging from natural language processing (Socher et al., 2013), social network analysis (Xiang and Neville, 2013), and ecological modeling (Tang et al., 2018; Chen et al., 2018). The applications we consider in this paper all require tight integration of constraint reasoning and machine learning. Our first application vehicle dispatching service planning is to recommend a route that satisfies the daily service needs as well as meeting the drivers’ preferences. Historical data may reveal that the drivers do not follow common stylized objectives such as minimizing distance or time. Therefore standard constraint reasoning tools, e.g. , solvers for the traveling salesman problem, cannot be applied. While we need machine learning to capture the drivers’ objective functions, pure machine learning-based approaches are insufficient because they often generate routes that violate delivery requests. Our second and third applications are program synthesis from natural language , which clearly requires machine learning to generate structured programs. Nevertheless, a pure learning approach cannot enforce the syntactic and semantic rules of those programs.  \\n\\nWe propose Co nstraint Re asoning embedded Structured Prediction ( Core-Sp ), a scalable constraint reasoning and machine learning integrated approach for learning over the structured domains. The main idea is to augment structured predictive models with a constraint reasoning module that represents physical and operational requirements. Specifically, we propose to embed decision diagrams (Akers, 1978; Bryant, 1986), a popular constraint reasoning tool, as a fully-differentiable module into deep neural networks. A decision diagram is a compact graphical representation of the constraints. It encodes each solution (an assignment of values to variables satisfying the constraints) as a path from the root to the terminal in the diagram. 
Core-Sp regards the neural network predictions as the simulation of descending along a path in the decision diagram. To ensure constraint satisfaction, Core-Sp filters out variable assignments from the neural network predictions that violate constraints. With the integration of Core-Sp , we provide structured prediction models with constraint satisfaction assurances. Moreover, structured prediction models with the Core-Sp layer enjoy a smaller prediction space than traditional structured prediction approaches, allowing our approach to learn faster in training and generalize better in testing. See Figure 1(a) for our proposed Core-Sp model which integrates constraint reasoning and machine learning for the three application domains. The high-level idea of Core-Sp is illustrated in Figure 1(b).  \\n\\nPrevious approaches have considered regularizing machine learning with constraint reasoning in various application domains. Within the broader context of learning constrained models, the work of Coletta et al. (2003); Lallouet et al. (2010); Beldiceanu and Simonis (2012); Bessiere et al. (2017); Addi et al. (2018) have studied automating the constraint acquisition process from historic data or (user-)generated queries. These approaches use partial or complete examples to identify the constraints that can be added to the model. The type of constraints that can be learned depends on the formulation. Several works (Punyakanok et al., 2004; Roth and Yih, 2005; Amos and Kolter, 2017; Ferber et al., 2020) enable learning in a constrained domain via encoding mathematical programming, such as quadratic programming or mixed integer linear programming, as a neural network layer. Deutsch et al. (2019) propose to formulate the output space as an automata. They use the constraints to prune all the invalid transitions in the automata to ensure the validity of the structured outputs. 
In addition, constraints imposed by a knowledge graph have been embedded into the neural network as differentiable layers (Peters et al., 2019; Wu et al., 2017). Zeng et al. (2021) and Heim (2019) enforce physical constraints or expert inputs as soft constraints. We will illustrate the difference between our approach and these methods in Section 3.2. A different approach is to embed a machine learning model into optimization, e.g. , by extending a constraint system with appropriate global constraints. For example, Lallouet and Legtchenko (2007) integrate neural networks and decision trees with constraint programming, while Lombardi et al. (2017) and Lombardi and Gualandi (2016) introduce a “Neuron” global constraint that represents a pre-trained neural network. Another series of approaches based on grammar variational autoencoders (Kusner et al., 2017; Dai et al., 2018; Jin et al., 2018) use neural networks to encode and decode from the parse-tree of a context-free grammar to generate discrete structures. Such approaches are used to generate chemical molecule expressions, which represent a structured domain. Machine learning approaches have also been used to solve constraint reasoning and optimization problems. This includes the works of Galassi et al. (2018) and Vinyals et al. (2015), which use neural networks to extend partial solutions to complete ones. Bello et al. (2017) handle the traveling salesman problem by framing it as reinforcement learning. Selsam et al. (2019) proposes to learn an SAT solver from single-bit supervision. Approaches based on neural Turing machines (Graves et al., 2016) employ neural networks with external memory for discrete structure generation. More recently, Khalil et al. (2017) tackle the combinatorial optimization problems in graphs, by employing neural networks to learn the heuristics in the backtrack-free search. There is also a recent trend to synthesize programs using machine learning (Guu et al., 2017; Shi et al., 2019).  
\\n\\nIn experimental analysis, we demonstrate the effectiveness of Core-Sp on the following three applications: (1) Vehicle Dispatching Service Planning : a route planning problem that recommends routes to drivers to meet the service needs while satisfying the drivers’ preferences. The implicit preferences of drivers are learned from the historical traveling data. The input of this problem is the daily service requests. The output is the permutations of the service locations, representing the sequential order that the locations should be visited by the drivers. This task requires machine learning models to capture drivers’ preferences from the traveling data, and constraint reasoning to ensure the satisfaction of service requests. (2) If-then Program Synthesis : the task is to automatically synthesize conditional programs from the natural language. Automatic program synthesis tools are useful to streamline the program of a few online services such as IFTTT and Zapier. The if-then program is in the form of: if trigger function happens in the trigger service , then take the action function from the action service . The machine learning task, therefore, is to predict the quadruple ( trigger service ,trigger function ,action service ,action function ). This application again requires machine learning to understand the semantics of the natural language, as well as constraint reasoning to satisfy the syntactic rules of the programs. (3) Text2SQL Generation : our last application is to automatically generate SQL queries that extract information from a database to answer a question posed in natural language. The neural model is used to understand the user’s queries in natural language while the constraint reasoning tool is applied to ensure the model generates grammaticallyvalid SQL queries.  \\n\\nOur proposed Core-Sp framework demonstrates superior performance against the stateof-the-art approaches in all three applications. 
First, the structures generated by Core-Sp are better in constraint satisfaction. In vehicle service dispatching, all Core-Sp generated routes are valid, while a conditional generative adversarial network (cGAN) without CoreSp generates on average less than $1\\\\%$ of valid routes when handling medium-sized delivery requests. We also apply a post-processing step (Deudon et al., 2018) to boost cGAN’s performance, but it cannot handle the complexity brought by the large combinatorial space of the routing problem. Its performance quickly defaults to the case without post-processing as the number of delivery locations increases. For if-then program synthesis, the percentage of valid programs produced increased from 88% to 100% with the Core-Sp module incorporated into the state-of-the-art LatentAttention model (Liu et al., 2016). For Text2SQL, the percentage of valid SQL queries increased from 83 .7% to 100% with Core-Sp incorporated into the state-of-the-art SQLNova model (Hwang et al., 2019) on a hard testing set. Core-Sp also improves the learning performance of structured prediction models. We show that the routes generated by Core-Sp better fulfill drivers’ preferences than cGAN without Core-Sp . In if-then program synthesis, Core-Sp module leads to approximately $2.0\\\\%$ improvement in accuracy compared with the state-of-the-art LatentAttention model and converges to models with higher accuracy in fewer training epochs. In Text2SQL generation, the Core-Sp module improves around 4 .2% in execution accuracy and 1 .9% in logical accuracy against SQLNova on a challenging test set.',\n",
       "  'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'},\n",
       " {'id': 454846633313916358,\n",
       "  'paper_id': '64a29654d68f896efa29af31',\n",
       "  'paper_title': 'Constraint Reasoning Embedded Structured Prediction.',\n",
       "  'chunk_id': 2,\n",
       "  'chunk_text': \"# 2.1 Structured Prediction\\nStructured prediction expands the output space of classification problems into a highdimensional combinatorial space (Bakır et al., 2007). Specifically, given a set of inputoutput samples $\\\\mathcal{D}^{t r}\\\\,=\\\\,\\\\{({\\\\boldsymbol{x}}^{(i)},{\\\\boldsymbol{y}}^{(i)})\\\\}_{i=1}^{N}$ drawn i.i.d. from some unknown distribution over the space $\\\\mathcal X\\\\times\\\\mathcal Y$ , a structured pr tion model learns a conditional distribution $p_{\\\\theta}(y|x)$ ,for all $(x,y)\\\\,\\\\in\\\\,\\\\mathcal{X}\\\\times\\\\mathcal{Y}$ from data D$\\\\mathcal{D}^{t r}$ , where $\\\\theta$ denotes the parameters of the structured prediction model. Note that the output space ${\\\\boldsymbol{\\\\mathcal{D}}}=\\\\{0,1\\\\}^{\\\\iota}$ is a high dimensional space of combinatorial structures. The three applications we consider in this paper are all structured prediction problems. In vehicle dispatching service planning, the structured outputs are the delivery routes on a map. In if-then program synthesis, the structured outputs are the programs that complete web-service tasks. In Text2SQL generation, the structured outputs are the SQL queries that follow the SQL grammar.  \\n\\nIn the literature, various approaches have been proposed for structured prediction problems. The classifier chain approach (Read et al., 2015) decomposes the joint likelihood into a product of conditionals and reduces the structured prediction problem into a series of binary prediction problems. In this approach, the error tends to propagate along the classifier chain, which limits its effectiveness (Dembczynski et al., 2010). Energy-based modeling, such as conditional random fields (Lafferty et al., 2001; Geman and Geman, 1984) and structured prediction energy networks (Belanger and McCallum, 2016) learn to assign a high likelihood to structures that exist in the training data set while keeping the likelihood low for unseen structures. 
Constraints can be incorporated into these models as prior terms in the energy function but approximated inference is required to compute the intractable partition function, which often hinders their scalability. Another line of research uses structured support vector machines (Tsochantaridis et al., 2005), which apply hinge loss and row generation approaches for structured prediction; however, these were superseded in performance by later neural-network-based approaches. Recently, generative models, such as conditional generative adversarial networks (Mirza and Osindero, 2014; Goodfellow et al., 2014), flow models (Rezende and Mohamed, 2015), and sequence-to-sequence models (Sutskever et al., 2014) have become increasingly popular for structured prediction. These models use highly flexible neural networks to increase model capability. The over-parameterized networks with gradient descent-based optimization can learn better representation for the structures than the classic shallow models. However, it is not straightforward to enforce constraints into the neural network-based models.  \\n\\nConstraints in Structured Prediction. Often the structured output space $\\\\mathcal{V}$ is subject to additional constraints $\\\\scriptscriptstyle\\\\mathcal{C}$ . The conditional probability that $y$ takes values that violate the (physical) constraints $\\\\mathcal{C}$ given the input $x$ is zero. Such information is known prior to the training of the machine learning model. Formally, we have:  \\n\\n$$\\np(y|x)\\\\left\\\\{\\\\int>0\\\\quad{\\\\mathrm{if~}}y{\\\\mathrm{~satisfies~}}{\\\\mathcal{C}},\\\\right.\\n$$  \\n\\nTake the first task discussed in this paper as an example. A valid delivery route should cover all the requested locations and should only visit each location once. Thus, the machine learning model should assign zero probability to those invalid routes. 
Notice that the constraints are often intricate and the inference problem of finding a valid structure satisfying constraints cannot be decomposed into independent small problems. After learning, the inference problem is to predict the structured output $y$ given the input $x$ . Such inference problems can be solved by either Maximum A Posteriori (MAP) inference, e.g. , computing $m a x_{y}\\\\ p(y|x)$ or marginal inference, e.g. , computing $\\\\mathbb{E}_{y}[p(y|x)]$ . Learning structured prediction models involves solving the inference problems within the learning loop, hence having an even higher complexity.  \\n\\nCombinatorial constraints render both the inference and the learning problems highly intractable. Indeed, much effort has been made to improve the efficiency of both the inference and learning problems (Pan and Srikumar, 2018; Bello et al., 2020). For example, Niculae et al. (2018) propose the sparseMAP function which solves the inference problem by returning a few sparse structures that attain high likelihoods. This inference method sits between the MAP and marginal inference. In their problem setup, sparseMAP can be solved via quadratic programming. However, combinatorial constraints considered in this paper make the inference problem non-convex, even for a fixed structured prediction model, let alone the more challenging learning problem. Overall, constrained structured prediction presents two main challenges. The first is the sample complexity , since massive data is needed to learn an accurate model in an exponentially large space. The second is the computational complexity , since it is combinatorially intractable (unless P=NP) to generate structured outputs subject to complicated constraints.  \\n\\nSequence-to-sequence Structured Prediction. Our proposed Core-Sp method is designed to extend sequence-to-sequence models, which are recently proposed popular structured prediction models (Sutskever et al., 2014). 
The sequence-to-sequence model uses the re-parameterization trick to model the conditional probability $p_{\\\\theta}(y|x)$ , where $x\\\\in\\\\mathscr{X}$ denotes the input variables and $y\\\\in\\\\mathcal{V}$ is the structured output. Here $\\\\theta$ denotes the parameters of the neural model. Instead of modeling the probability $p_{\\\\theta}(y|x)$ directly, the model introduces an additional random variable $\\\\mathcal{Z}$ and models it as a deterministic transformation from random variable $\\\\mathcal{Z}$ and evidence $x$ to the output $y$ . In other words, the conditional probability $p_{\\\\theta}(y|x)$ is an integral over random variable $z$ in the following way:  \\n\\n$$\\n\\\\begin{array}{c}{{p_{\\\\theta}(y|x)=\\\\displaystyle\\\\int p_{\\\\theta}(y|x,z)p(z)\\\\;d z,}}\\\\\\\\ {{p_{\\\\theta}(y|x,z)=\\\\mathbb{1}\\\\{y=f_{\\\\theta}(x,z)\\\\},}}\\\\end{array}\\n$$  \\n\\nwhere we assume $\\\\mathcal{Z}$ is from a known prior probability distribution $p(z)$ . As a result, we only need to model $p_{\\\\theta}(y|x,z)$ for the overall model $p_{\\\\theta}(y|x)$ . We further assume that $p_{\\\\theta}(y|x,z)$ is given in the form of a deterministic function. We let $f_{\\\\theta}(x,z)\\\\in\\\\mathcal{D}$ be a deterministic mapping from inputs $(x,z)$ to an output in the structured space $\\\\boldsymbol{y}$ . The indicator function $\\\\mathbb{I}\\\\{\\\\cdot\\\\}$ evaluates to $1$ if and only if $y=f_{\\\\boldsymbol{\\\\theta}}(x,z)$ . This formulation is closely related to the generative adversarial network and gives us high flexibility to model multi-modal distributions. Take the vehicle dispatching service planning as an example. The input $x$ is the daily service requests and $y$ is the suggested dispatching route. There can be several routes that meet the service demands and satisfy the driver’s underlying preference function. In this case, the conditional probability $p_{\\\\theta}(y|x)$ may have multiple modes, one for each good route. 
This formulation allows us to represent the multi-modal distribution effectively. The variable $z$ decides which route to pick. The function $f_{\\\\boldsymbol{\\\\theta}}(x,z)$ returns one route that meets the demand of input $x$ and is randomly selected by $\\\\mathcal{Z}$ . If $p_{\\\\theta}(y|x)$ has $k$ modes, the space of $z$ will be split into $k$ regions where variable $z$ in every region will be mapped to one mode in $p_{\\\\theta}(y|x)$ .  \\n\\nWe use a sequence-to-sequence neural network to model the function $f_{\\\\theta}(x,z)$ . Assume the input variables $x,\\\\ z$ , and the output $y$ are all represented in sequential forms $x=$ $(x_{1},x_{2},\\\\ldots,x_{T})$ ,$z\\\\,=\\\\,(z_{1},z_{2},\\\\dots,z_{T})$ and $y\\\\,=\\\\,(y_{1},y_{2},\\\\ldots,y_{T})$ .The sequence-to-sequence model is made of an encoder and a decoder. The sequential encoder receives $x$ and outputs a representation vector for input $x$ .The sequential decoder receives the output of the encoder as well as $z$ and outputs $y$ in $T$ steps, where $T$ refers to the maximum length for variable $y$ . In the $k$ -th step ( $1\\\\leq k\\\\leq T$ ), the decoder network takes $z_{k}$ , and the hidden vector $h_{k-1}$ from the previous step as inputs, and outputs a score vector $o_{k}=(o_{k1},o_{k2},\\\\dots,o_{k D_{k}})$ of length $D_{k}\\\\;=\\\\;|D(y_{k})|$ .Here, $o_{k}$ corresponds to the un-normalized likelihoods of each value that variable $y_{k}$ can take. The softmax function is then applied to get the normalized probability:  \\n\\n$$\\np_{k j}=p\\\\left(y_{k}=v_{j}|x,h_{k-1}\\\\right)={\\\\frac{\\\\exp(o_{k j})}{\\\\sum_{j^{\\\\prime}=1}^{D_{k}}\\\\exp(o_{k j^{\\\\prime}})}},\\\\qquad{\\\\mathrm{for~}}j=1,2,\\\\ldots,D_{k}.\\n$$  \\n\\n$p_{k j}$ is the probability that variable $y_{k}$ takes the $j$ -th value $v_{j}$ . Assume the prior distribution $p(z_{k})$ is the uniform distribution in $(0,1)$ , denoted by $\\\\mathcal{U}(0,1)$ . 
Variable $z_{k}$ is sampled from $\\\\mathcal{U}(0,1)$ and is used to determine the value for $y_{k}$ according to the probability distribution vector $p_{k}=(p_{k1},p_{k2},...\\\\,,p_{k D_{k}})$ . Let $P_{k1},P_{k2},...,P_{k(D_{k}+1)}$ be the cumulative probabilities:  \\n\\n$$\\nP_{k j}=\\\\left\\\\{\\\\!\\\\!\\\\begin{array}{l l}{0}&{\\\\mathrm{for~}j=1,}\\\\\\\\ {\\\\sum_{j^{\\\\prime}=1}^{j-1}p_{k j^{\\\\prime}}}&{\\\\mathrm{for~}j=2,3,...\\\\,,D_{k},}\\\\\\\\ {1}&{\\\\mathrm{for~}j=D_{k}+1.}\\\\end{array}\\\\!\\\\!\\\\right.\\n$$  \\n\\n$y_{k}$ is set to t e$v_{j}$ if and only if $z_{k}\\\\ \\\\in\\\\ \\\\left[P_{k j},P_{k(j+1)}\\\\right)$ '\\x01.Notice that because $z_{k}$ is sampled from U$\\\\mathcal{U}(0,1)$ 1), the probability that $y_{k}$ takes the $j$ -th value $v_{j}$ is exactly $p_{k j}$ . Aside from producing the value for $y_{k}$ in the $k$ -th step, the sequence-to-sequence neural net also produces the hidden-state vector $h_{k}$ at the $k$ -th step, which is used by the neural net again in the subsequent $(k+1)$ -th step. The overall architecture of the sequence-to-sequence model can be seen in Figure 4.  \\n\\nThe training process of the sequence-to-sequence model is to minimize a pre-defined loss function, or an additional discriminator neural net, which penalizes the differences of the predicted structure $f_{\\\\theta}(x,z)$ and the observed structure $y$ . Here $f_{\\\\theta}(x,z)$ is a predicted sequence obtained from the above process. Given a training data set $\\\\mathcal{D}^{t r}=\\\\{(\\\\boldsymbol{x}^{(i)},\\\\boldsymbol{y}^{(i)})\\\\}_{i=1}^{N}$ ,the learning objective is to minimize the loss function:  \\n\\n  \\nFigure 2: Illustration of Multi-valued Decision Diagrams (MDDs) for decision variables $x_{1},x_{2},x_{3}$ .(a) An exact MDD with all variable assignments satisfying two constraints: all-diff $(x_{1},x_{2},x_{3})$ and $x_{1}\\\\neq v_{1}$ .(b) A width-1 relaxed MDD for the exact MDD in (a). 
(c) A width-2 relaxed MDD, which is formed by combining nodes $u_{4}$ and $u_{5}$ of the MDD in (a).  \\n\\n$$\\n\\\\mathcal{L}(\\\\theta)=\\\\frac{1}{N}\\\\sum_{i=1}^{N}\\\\mathbb{E}_{z^{(i)}}\\\\left[\\\\ell\\\\left(f_{\\\\theta}\\\\left(x^{(i)},z^{(i)}\\\\right),y^{(i)}\\\\right)\\\\right].\\n$$  \\n\\nHere $\\\\ell(\\\\cdot,\\\\cdot)$ can be a predefined loss function that measures the mismatch between the predicted and observed structures. Function $\\\\ell(\\\\cdot,\\\\cdot)$ can also be represented as a discriminator network, which leads to the training of a generative adversarial network. The parameters $\\\\theta$ are updated via gradient descent, i.e. ,$\\\\theta^{t+1}=\\\\theta^{t}{-}\\\\eta\\\\nabla{\\\\mathcal{L}}(\\\\theta)$ , where $\\\\eta$ denotes the learning rate.\",\n",
       "  'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'},\n",
       " {'id': 454846633347470792,\n",
       "  'paper_id': '64a29654d68f896efa29af31',\n",
       "  'paper_title': 'Constraint Reasoning Embedded Structured Prediction.',\n",
       "  'chunk_id': 3,\n",
       "  'chunk_text': '# 2.2 Decision Diagrams\\nDecision diagrams were originally introduced to compactly represent Boolean functions in a graphical form (Akers, 1978; Bryant, 1986). Since then, they have been widely used in the context of verification and configuration problems (Wegener, 2000). More recently, they have been used successfully as an optimization tool, by representing the set of solutions to combinatorial optimization problems (Bergman et al., 2016b; van Hoeve, 2022).  \\n\\nDecision diagrams are defined with respect to a sequence of decision variables $x_{1},\\\\ldots,x_{n}$ .Variable $x_{i}$ has a domain of possible values $D(x_{i})$ , for $i=1,2,\\\\dots,n$ . A decision diagram is a directed acyclic graph, with $n+1$ layers of nodes. Layer 1 contains a single node $s$ ,called the root. Layer $n+1$ also contains a single node $t$ , called the terminal. An arc from a node in layer $i$ to a node in layer $i+1$ represents a possible assignment of variable $x_{i}$ to a value in its domain and is therefore associated with a label in $D(x_{i})$ . For an arc $e(v,u)$ ,we use ${\\\\tt v a l}(v,u)\\\\,\\\\in\\\\,D(x_{i})$ to represent the assigned label for variable $x_{i}$ . For a node $\\\\upsilon$ in layer $i$ , we use $\\\\mathtt{v a l}(v)\\\\subseteq D(x_{i})$ to represent the union of the values of each arc starting from node $v$ ,i.e. ,${\\\\mathsf{v a l}}(v)\\\\,=\\\\,\\\\cup_{e(v,u)}\\\\{{\\\\mathsf{v a l}}(v,u)\\\\}$ . In other words, ${\\\\tt v a l}(v)$ represents the possible value assignments for the decision variable $x_{i}$ at node $\\\\upsilon$ . Each path from the root $s$ to the terminal $t$ represents a solution, i.e. , a complete variable assignment. In this paper, we consider variables with domains of categorical values, which result in so-called multi-valued decision diagrams (MDDs) (Wegener, 2000). See Figure 2 for an example.  
\\n\\n  \\nFigure 3: Node splitting and arc filtering for MDDs for variables $x_{1},x_{2},x_{3}$ .(a) A width-1 relaxed MDD as in Figure 2(b). (b) Split node $u_{1}$ into $\\\\hat{u}_{1}$ and $\\\\tilde{u}_{1}$ .(c) Filter arcs $e(\\\\hat{u}_{1},u_{2})\\\\;=\\\\;v_{2},e(\\\\tilde{u}_{1},u_{2})\\\\;=\\\\;v_{3}$ that violate the constraint all-diff $(x_{1},x_{2},x_{3})$ .The arcs in dashed lines are removed. (d) A width-2 relaxed MDD after one iteration of node splitting and arc filtering.  \\n\\nExact Decision Diagrams. Given a set of constraints $\\\\mathcal{C}$ , the MDD $\\\\mathcal{M}$ is said to be exact with respect to $\\\\mathcal{C}$ if and only if every path that leads from the root node $s$ to the terminal node $t$ in $\\\\mathcal{M}$ is a variable assignment satisfying all constraints in $\\\\scriptscriptstyle\\\\mathcal{C}$ . Conversely, every valid variable assignment can be found as a path from $s$ to $t$ in $\\\\mathcal{M}$ .  \\n\\nRelaxed Decision Diagrams. Since exact decision diagrams can grow exponentially large, relaxed decision diagrams were introduced to limit their size (Andersen et al., 2007). The set of paths in a relaxed decision diagram forms a superset of the paths in the associated exact decision diagram. Relaxed MDDs are often defined with respect to the maximum layer width, which is the number of nodes in its largest layer.  \\n\\nVariable Ordering. In general, the size of an exact decision diagram is known to strongly depend on the variable ordering (Friedman and Supowit, 1990). In our applications, however, we consider sequential decision processes which follow a natural prescribed ordering. Our approach can also be applied to more general decision problems, in which case the variable ordering needs to be considered when compiling the MDD.  \\n\\nExample 1 Figure $\\\\mathcal{Q}$ demonstrates several MDDs. 
Let $x_{1},x_{2},x_{3}$ be a sequence of decision variables with domain $D(x_{1})\\\\ =\\\\ D(x_{2})\\\\ =\\\\ D(x_{3})\\\\ =\\\\ \\\\{v_{1},v_{2},v_{3}\\\\}$ .The constraint $a\\\\,\\\\!\\\\ l\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\
\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\!\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,\\\\,$ restricts the values of $x_{1},x_{2}$ and $x3$ to be all different, i.e., they form a permutation. The other constraint is $x_{1}\\\\neq v_{1}$ . (1) Exact MDD. The set of feasible permutations is $\\\\{(v_{2},v_{1},v_{3})$ ,$(v_{2},v_{3},v_{1})$ ,$(v_{3},v_{2},v_{1})$ ,$(v_{3},v_{1},v_{2})\\\\}$ . Figure 2(a) depicts the exact MDD that encodes all permutations satisfying the two constraints. (2) Relaxed MDD. Figure 2(b) is a width-1 relaxed MDD and Figure $\\\\mathcal{Q}(c)$ is a width$\\\\it{2}$ relaxed MDD. The set of paths in the relaxed MDD forms a superset of all feasible permutations. As an illustration, Figure 2(c) contains two infeasible solutions $\\\\{(v_{3},v_{1},v_{1}),(v_{2},v_{2},v_{2})\\\\}$ . (3) Variable ordering. All the MDDs in Figure 2 have the same variable ordering of $\\\\pi=(1,2,3)$ , meaning that the MDD first expands on variable $x_{1}$ , then $x_{2}$ , finally $x_{3}$ .  \\n\\nDecision Diagram Compilation. Decision diagrams can be compiled via a repeated process of node splitting and arc filtering from a width-1 relaxed MDD (Andersen et al., 2007; Bergman et al., 2016a). 
Arc filtering removes arcs that lead to infeasible solutions, while node splitting increases the size of the decision diagram by splitting one node into two or more nodes. In practice, one can reach an exact MDD by repeatedly going through the splitting and filtering processes from a width-1 MDD. We refer to Cir´e and van Hoeve (2013) for the detailed process of MDD compilation for sequential decision problems.  \\n\\nExample 2 Figure 3 demonstrates one possible process of applying the node splitting and arc filtering steps. We re-use the example in Figure $\\\\mathcal{Q}(b)$ as the initial MDD in Figure $\\\\mathcal{Y}(a)$ ,which depicts a width-1 relaxed MDD before compilation. The constraint to be applied is $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ , i.e., the assignments of variables $x_{1},x_{2},x_{3}$ should be pairwise different. The node $u_{1}$ in Figure $\\\\mathcal{Y}(a)$ is split into two nodes $\\\\hat{u}_{1},\\\\tilde{u}_{1}$ in Figure $\\\\mathcal{Y}(b)$ . The incoming arc $e(s,u_{1})$ with labe $v_{2}$ is assigned to node $\\\\hat{u}_{1}$ and the other incoming arc $e(s,u_{1})$ with label $v_{3}$ is assigned to node $\\\\tilde{u}_{1}$ . The outgoing arcs of node $u_{1}$ are copied for the two nodes. In Figure $\\\\mathcal{Y}(c)$ , the arc filtering process checks if certain variable assignments violate constraints for the two nodes. Arc $e(\\\\hat{u}_{1},u_{2})=v_{2}$ is not compatible with the previous arc $e(s,\\\\hat{u}_{1})$ with label $v_{2}$ because it violates $a\\\\o{l}\\\\,\\\\o{l}-d i\\\\,\\\\b{f}\\\\b{f}(x_{1},x_{2},x_{3})$ . Thus it is removed. For the same reason, arc $e(\\\\tilde{u}_{1},u_{2})=v_{3}$ is also removed. (d) We get a width$\\\\boldsymbol{\\\\mathscr{Q}}$ relaxed MDD after splitting node $u_{1}$ and filtering the arcs.',\n",
       "  'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'},\n",
       " {'id': 454846633380500938,\n",
       "  'paper_id': '64a29654d68f896efa29af31',\n",
       "  'paper_title': 'Constraint Reasoning Embedded Structured Prediction.',\n",
       "  'chunk_id': 4,\n",
       "  'chunk_text': '# 3. Constraint Reasoning Embedded Structured Prediction\\nCore-Sp is motivated by the lack of constraint satisfaction in sequence-to-sequence structured prediction models. The key idea of Core-Sp is the correspondence between the predicted outcomes of a sequence-to-sequence model and a path in a multi-valued decision diagram (MDD). Figure 4 provides an example. In this example, the sequence-to-sequence model outputs a sequence of variable assignments $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ ,$y_{3}=v_{1}$ in Figure 4(a), which exactly corresponds to the highlighted blue path in the MDD in Figure 4(b). However, the sequence-to-sequence model is also likely to output a variable assignment with no correspondence to the MDD. For example, if the neural model in Figure 4(a) outputs $y_{1}\\\\,=\\\\,v_{2}$ ,$y_{2}~=~v_{3}$ ,$y_{3}\\\\,=\\\\,v_{2}$ , there is no corresponding path in the MDD in Figure 4(b). This illustrates the case where the output of the sequence-to-sequence model violates the all-diff constraint. Indeed, neural network-based models for structured prediction problems are not guaranteed to satisfy constraints as defined in Equation (1), which forms a key limitation of state-of-the-art structured prediction models.  \\n\\nCore-Sp ensures constraint satisfaction of the neural network prediction by limiting the values that each variable can take following the flow of the MDD. Suppose we set $y_{1}=v_{2}$ Figure 4: Illustration of (a) a sequence-to-sequence model which generates an output corresponding to (b) a path in the multi-valued decision diagram. (a) A sequenceto-sequence model receives input $x$ and random variables $\\\\mathcal{Z}$ , and outputs $y_{1}=v_{2}$ ,$y_{2}=v_{3}$ and $y_{3}=v_{1}$ in three steps. 
(b) The assignment $\\\\left(y_{1},y_{2},y_{3}\\\\right)=\\\\left(v_{2},v_{3},v_{1}\\\\right)$ corresponds to path $s\\\\;{\\\\xrightarrow{v_{2}}}\\\\;u_{1}\\\\;{\\\\xrightarrow{v_{3}}}\\\\;u_{4}\\\\;{\\\\xrightarrow{v_{1}}}\\\\;t$ −→ −→ −→ in the multi-valued decision diagram.  \\n\\n  \\n\\nand $y_{2}=v_{3}$ in Figure 4(b) and arrive at node $u_{4}$ , the only valid option for $y3$ is to set $y_{3}=v_{1}$ . The other options $y_{3}=v_{2}$ or $y_{3}=v_{3}$ lead to constraint violations. Hence CoreSp masks out the choices of $y_{3}=v_{2}$ and $y_{3}=v_{3}$ for the sequence-to-sequence model. In this way, Core-Sp addresses a key limitation of structured prediction models. We next provide the details of Core-Sp .',\n",
       "  'original_filename': 'Journal_Paper_Meta_Data_Journal_of_Machine_Learning_Research_with_whole_text.db'}]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
